import {AssemblyInstructionInfo} from '../base.js';

    export function getAsmOpcode(opcode: string | undefined): AssemblyInstructionInfo | undefined {
        if (!opcode) return;
        switch (opcode) {
            case "abs":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs\" target=\"_blank\" rel=\"noopener noreferrer\">abs(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs\" target=\"_blank\" rel=\"noopener noreferrer\">abs(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs\" target=\"_blank\" rel=\"noopener noreferrer\">abs(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: abs</h1><section id=\"floating-point-instructions-abs\">\n\n\n<p>Absolute value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>abs{.ftz}.f32  d, a;\nabs.f64        d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Take the absolute value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and store the result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = |a|;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving 
zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> input yields unspecified <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>. For <code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> input is passed\nthrough unchanged. Future implementations may comply with the IEEE 754 standard by preserving\npayload and modifying only the sign bit.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>abs.ftz.f32  x,f0;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: abs</h1><section id=\"half-precision-floating-point-instructions-abs\">\n\n\n<p>Absolute value</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>abs{.ftz}.f16    d, a;\nabs{.ftz}.f16x2  d, a;\nabs.bf16         
d, a;\nabs.bf16x2       d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Take absolute value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and store the result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vector by extracting half word values\nfrom the source operand. Absolute values of half-word operands are then computed in parallel to\nproduce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> result in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. 
For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction\ntype, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = |a|;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = |fA[i]|;\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt>\n<dd>\n<p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">abs.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> inputs yield an unspecified <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>. 
Future implementations may comply with the IEEE 754\nstandard by preserving payload and modifying only the sign bit.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.5.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">abs.bf16x2</span></code> introduced in PTX ISA 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">abs.bf16x2</span></code> requires architecture <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>abs.ftz.f16  x,f0;\nabs.bf16     x,b0;\nabs.bf16x2   x1,b1;\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: abs</h1><section id=\"integer-arithmetic-instructions-abs\">\n\n\n<p>Absolute value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>abs.type  d, a;\n\n.type = { .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Take the absolute value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and store it in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = |a|;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Only for signed integers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA 
Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>abs.s32  r0,a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: abs\n\n\n\nAbsolute value.\n\nSyntax\n\nabs{.ftz}.f32  d, a;\n\nabs.f64        d, a;\n\nDescription\n\nTake the absolute value of a and store the result in d.\n\nSemantics\n\nd = |a|;\n\nNotes\n\nSubnormal numbers:\n\nsm_20+\n\nBy default, subnormal numbers are supported.\n\nabs.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1x\n\nabs.f64 supports subnormal numbers.\n\nabs.f32 flushes subnormal inputs and results to sign-pr...\n\n=====Half Precision Floating Point Instructions: abs\n\n\n\nAbsolute value\n\nSyntax\n\nabs{.ftz}.f16    d, a;\n\nabs{.ftz}.f16x2  d, a;\n\nabs.bf16         d, a;\n\nabs.bf16x2       d, a;\n\nDescription\n\nTake absolute value of a and store the result in d.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vector by extracting half word values\n\nfrom the source operand. Absolute values of half-word operands are then computed in parallel to\n\nproduce .f16x2 or .bf16x2 result in...\n\n=====Integer Arithmetic Instructions: abs\n\n\n\nAbsolute value.\n\nSyntax\n\nabs.type  d, a;\n\n.type = { .s16, .s32, .s64 };\n\nDescription\n\nTake the absolute value of a and store it in d.\n\nSemantics\n\nd = |a|;\n\nNotes\n\nOnly for signed integers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nabs.s32  r0,a;\n\n... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs"
            };

        case "activemask":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask\" target=\"_blank\" rel=\"noopener noreferrer\">activemask <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: activemask</h1><section id=\"parallel-synchronization-and-communication-instructions-activemask\">\n\n\n<p>Queries the active threads within a warp.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>activemask.b32 d;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">activemask</span></code> queries predicated-on active threads from the executing warp and sets the destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with 32-bit integer mask where bit position in the mask corresponds to the thread\u2019s\n<code class=\"docutils literal notranslate\"><span class=\"pre\">laneid</span></code>.</p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is a 32-bit destination register.</p>\n<p>An active thread will contribute 1 for its entry in the result and exited or inactive or\npredicated-off thread will contribute 0 for its entry in the result.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>activemask.b32  %r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Queries the active threads within a warp.\n\nSyntax\n\nactivemask.b32 d;\n\nDescription\n\nactivemask queries predicated-on active threads from the executing warp and sets the destination\n\nd with 32-bit integer mask where bit position in the mask corresponds to the thread\u2019s\n\nlaneid.\n\nDestination d is a 32-bit destination register.\n\nAn active thread will contribute 1 for its entry in the result and exited or inactive or\n\npredicated-off thread will contribute 0 for its entry in the result.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.2.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\nactivemask.b32  %r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask"
            };

        case "add":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add\" target=\"_blank\" rel=\"noopener noreferrer\">add(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add\" target=\"_blank\" rel=\"noopener noreferrer\">add(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-add\" target=\"_blank\" rel=\"noopener noreferrer\">add(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc\" target=\"_blank\" rel=\"noopener noreferrer\">add.cc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: add</h1><section id=\"floating-point-instructions-add\">\n\n\n<p>Add two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>add{.rnd}{.ftz}{.sat}.f32  d, a, b;\nadd{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs addition and writes the resulting value into a destination register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a + b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span 
class=\"pre\">.rn</span></code></dt>\n<dd>\n<p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>The default value of rounding modifier is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>. Note that an <code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. An <code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. 
In particular, <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> sequences with no rounding modifiers may be optimized to\nuse fused-multiply-add instructions on the target device.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>Saturation modifier:</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.sat.f32</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Rounding modifiers have the following target requirements:</p>\n<dl>\n<dt>\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code>\n</dt>\n<dd>\n<p>available for all targets</p>\n</dd>\n<dt>\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code>\n</dt>\n<dd>\n<p>for <code class=\"docutils literal notranslate\"><span class=\"pre\">add.f64</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>for <code class=\"docutils literal notranslate\"><span class=\"pre\">add.f32</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n</dd>\n</dl>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>@p  add.rz.ftz.f32  f1,f2,f3;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: add</h1><section id=\"half-precision-floating-point-instructions-add\">\n\n\n<p>Add two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div 
class=\"highlight\"><pre><span></span>add{.rnd}{.ftz}{.sat}.f16   d, a, b;\nadd{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nadd{.rnd}.bf16   d, a, b;\nadd{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs addition and writes the resulting value into a destination register.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vectors by half word values from source\noperands. Half-word operands are then added in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> result\nin destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>\ninstruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type,\noperands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = a + b;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = fA[i] + fB[i];\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt>\n<dd>\n<p>mantissa LSB rounds to nearest even</p>\n</dd>\n</dl>\n<p>The default value of rounding modifier is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>. Note that an <code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. 
An <code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> sequences with no rounding modifiers may be optimized to\nuse fused-multiply-add instructions on the target device.</p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt>\n<dd>\n<p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">add.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt>Saturation modifier:</dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.sat.{f16,</span> <span class=\"pre\">f16x2}</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add{.rnd}.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">add{.rnd}.bf16x2</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add{.rnd}.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">add{.rnd}.bf16x2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// scalar f16 additions\nadd.f16        d0, a0, b0;\nadd.rn.f16     d1, a1, b1;\nadd.bf16       bd0, ba0, bb0;\nadd.rn.bf16    bd1, ba1, bb1;\n\n// SIMD f16 addition\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1};   // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3};   // pack two f16 to 32bit f16x2\nadd.f16x2  p3, p1, p2;   // SIMD f16x2 addition\n\n// SIMD bf16 addition\ncvt.rn.bf16x2.f32 p4, f4, f5; // Convert two f32 into packed bf16x2\ncvt.rn.bf16x2.f32 p5, f6, f7; // Convert two f32 into packed bf16x2\nadd.bf16x2  p6, p4, p5;       // SIMD bf16x2 addition\n\n// SIMD fp16 addition\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nadd.f16x2       f2, f0, f1;     // SIMD f16x2 addition\n\nld.global.b32   
f3, [addr + 8];  // load 32 bit which hold packed bf16x2\nld.global.b32   f4, [addr + 12]; // load 32 bit which hold packed bf16x2\nadd.bf16x2      f5, f3, f4;      // SIMD bf16x2 addition\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: add</h1><section id=\"integer-arithmetic-instructions-add\">\n\n\n<p>Add two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>add.type       d, a, b;\nadd{.sat}.s32  d, a, b;     // .sat applies only to .s32\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64,\n          .u16x2, .s16x2 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs addition and writes the resulting value into a destination register.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> instruction types, forms input vectors by half word values from source\noperands. Half-word operands are then added in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> result in\ndestination.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code>. 
For instruction types <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code>,\noperands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (type == u16x2 || type == s16x2) {\n    iA[0] = a[0:15];\n    iA[1] = a[16:31];\n    iB[0] = b[0:15];\n    iB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = iA[i] + iB[i];\n    }\n} else {\n    d = a + b;\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Saturation modifier:</p>\n<dl class=\"simple\">\n<dt>.sat</dt>\n<dd>\n<p>limits result to <code class=\"docutils literal notranslate\"><span class=\"pre\">MININT..MAXINT</span></code> (no overflow) for the size of the operation. 
Applies only to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> type.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.u16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">add.s16x2</span></code> introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.u16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">add.s16x2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>@p  add.u32     x,y,z;\n    add.sat.s32 c,c,1;\n    add.u16x2   u,v,w;\n</pre></div>\n</div>\n</section>\n<h1>Extended-Precision Arithmetic Instructions: add.cc</h1><section id=\"extended-precision-arithmetic-instructions-add-cc\">\n\n\n<p>Add two values with carry-out.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>add.cc.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs integer addition and writes the carry-out value into the condition code register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a + b;\n</pre></div>\n</div>\n<p>carry-out written to <code class=\"docutils literal notranslate\"><span class=\"pre\">CC.CF</span></code></p>\n<p><strong>Notes</strong></p>\n<p>No integer rounding modifiers.</p>\n<p>No saturation.</p>\n<p>Behavior is the same for unsigned and signed integers.</p>\n<p><strong>PTX ISA 
Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">add.cc</span></code> introduced in PTX ISA version 1.2.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">add.cc</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">add.cc</span></code> is supported on all target architectures.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">add.cc</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>@p  add.cc.u32   x1,y1,z1;   // extended-precision addition of\n@p  addc.cc.u32  x2,y2,z2;   // two 128-bit values\n@p  addc.cc.u32  x3,y3,z3;\n@p  addc.u32     x4,y4,z4;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: add\n\n\n\nAdd two values.\n\nSyntax\n\nadd{.rnd}{.ftz}{.sat}.f32  d, a, b;\n\nadd{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nPerforms addition and writes the resulting value into a destination register.\n\nSemantics\n\nd = a + b;\n\nNotes\n\nRounding modifiers:\n\n.rn\n\nmantissa LSB rounds to nearest even\n\n.rz\n\nmantissa LSB rounds towards zero\n\n.rm\n\nmantissa LSB rounds towards negative infinity\n\n.rp\n\nmantis...\n\n=====Half Precision Floating Point Instructions: add\n\n\n\nAdd two values.\n\nSyntax\n\nadd{.rnd}{.ftz}{.sat}.f16   d, a, b;\n\nadd{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nadd{.rnd}.bf16   d, a, b;\n\nadd{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms addition and writes the resulting value into a destination register.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vectors by half word values from source\n\noperands. Half-word operands are then added in paral...\n\n=====Integer Arithmetic Instructions: add\n\n\n\nAdd two values.\n\nSyntax\n\nadd.type       d, a, b;\n\nadd{.sat}.s32  d, a, b;     // .sat applies only to .s32\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64,\n\n          .u16x2, .s16x2 };\n\nDescription\n\nPerforms addition and writes the resulting value into a destination register.\n\nFor .u16x2, .s16x2 instruction types, forms input vectors by half word values from source\n\noperands. 
Half-word operands are...\n\n=====Extended-Precision Arithmetic Instructions: add.cc\n\n\n\nAdd two values with carry-out.\n\nSyntax\n\nadd.cc.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nPerforms integer addition and writes the carry-out value into the condition code register.\n\nSemantics\n\nd = a + b;\n\ncarry-out written to CC.CF\n\nNotes\n\nNo integer rounding modifiers.\n\nNo saturation.\n\nBehavior is the same for unsigned and signed integers.\n\nPTX ISA Notes\n\n32-bit add.cc introduced in PTX ... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add"
            };

        case "addc":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc\" target=\"_blank\" rel=\"noopener noreferrer\">addc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Extended-Precision Arithmetic Instructions: addc</h1><section id=\"extended-precision-arithmetic-instructions-addc\">\n\n\n<p>Add two values with carry-in and optional carry-out.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>addc{.cc}.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs integer addition with carry-in and optionally writes the carry-out value into the condition\ncode register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a + b + CC.CF;\n</pre></div>\n</div>\n<p>if <code class=\"docutils literal notranslate\"><span class=\"pre\">.cc</span></code> specified, carry-out written to <code class=\"docutils literal notranslate\"><span class=\"pre\">CC.CF</span></code></p>\n<p><strong>Notes</strong></p>\n<p>No integer rounding modifiers.</p>\n<p>No saturation.</p>\n<p>Behavior is the same for unsigned and signed integers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">addc</span></code> introduced in PTX ISA version 1.2.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">addc</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">addc</span></code> is supported on all target architectures.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span 
class=\"pre\">addc</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>@p  add.cc.u32   x1,y1,z1;   // extended-precision addition of\n@p  addc.cc.u32  x2,y2,z2;   // two 128-bit values\n@p  addc.cc.u32  x3,y3,z3;\n@p  addc.u32     x4,y4,z4;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Add two values with carry-in and optional carry-out.\n\nSyntax\n\naddc{.cc}.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nPerforms integer addition with carry-in and optionally writes the carry-out value into the condition\n\ncode register.\n\nSemantics\n\nd = a + b + CC.CF;\n\nif .cc specified, carry-out written to CC.CF\n\nNotes\n\nNo integer rounding modifiers.\n\nNo saturation.\n\nBehavior is the same for unsigned and signed integers.\n\nPTX ISA Notes\n\n32-bit addc introduced in PTX ISA version 1.2.\n\n64-bit addc introduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\n32-bit addc is supported on all target architectures.\n\n64-bit addc requires sm_20 or higher.\n\nExamples\n\n@p  add.cc.u32   x1,y1,z1;   // extended-precision addition of\n\n@p  addc.cc.u32  x2,y2,z2;   // two 128-bit values\n\n@p  addc.cc.u32  x3,y3,z3;\n\n@p  addc.u32     x4,y4,z4;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc"
            };

        case "address_size":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-address-size\" target=\"_blank\" rel=\"noopener noreferrer\">address_size <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>PTX Module Directives: .address_size</h1><section id=\"ptx-module-directives-address-size\">\n\n\n<p>Address size used throughout PTX module.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.address_size  address-size\naddress-size = { 32, 64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Specifies the address size assumed throughout the module by the PTX code and the binary DWARF\ninformation in PTX.</p>\n<p>Redefinition of this directive within a module is not allowed. In the presence of separate\ncompilation all modules must specify (or default to) the same address size.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.address_size</span></code> directive is optional, but it must immediately follow the <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code>directive if present within a module.</p>\n<p><strong>Semantics</strong></p>\n<p>If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.address_size</span></code> directive is omitted, the address size defaults to 32.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// example directives\n   .address_size 32       // addresses are 32 bit\n   .address_size 64       // addresses are 64 bit\n\n// example of directive placement within a module\n   .version 2.3\n   
.target sm_20\n   .address_size 64\n...\n.entry foo () {\n...\n}\n</pre></div>\n</div>\n</section>",
                "tooltip": "Address size used throughout PTX module.\n\nSyntax\n\n.address_size  address-size\n\naddress-size = { 32, 64 };\n\nDescription\n\nSpecifies the address size assumed throughout the module by the PTX code and the binary DWARF\n\ninformation in PTX.\n\nRedefinition of this directive within a module is not allowed. In the presence of separate\n\ncompilation all modules must specify (or default to) the same address size.\n\nThe .address_size directive is optional, but it must immediately follow the .targetdirective if present within a module.\n\nSemantics\n\nIf the .address_size directive is omitted, the address size defaults to 32.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n// example directives\n\n   .address_size 32       // addresses are 32 bit\n\n   .address_size 64       // addresses are 64 bit\n\n// example of directive placement within a module\n\n   .version 2.3\n\n   .target sm_20\n\n   .address_size 64\n\n...\n\n.entry foo () {\n\n...\n\n}\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-address-size"
            };

        case "aggr_smem_size":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-aggr-smem-size\" target=\"_blank\" rel=\"noopener noreferrer\">aggr_smem_size <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %aggr_smem_size</h1><section id=\"special-registers-aggr-smem-size\">\n\n\n<p>Total size of shared memory used by a CTA of a kernel.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %aggr_smem_size;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with total aggregated size of shared memory\nconsisting of the size of user shared memory allocated (statically and dynamically) at launch time\nand the size of shared memory region which is reserved for the NVIDIA system software use.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32  %r, %aggr_smem_size;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Total size of shared memory used by a CTA of a kernel.\n\nSyntax (predefined)\n\n.sreg .u32 %aggr_smem_size;\n\nDescription\n\nA predefined, read-only special register initialized with total aggregated size of shared memory\n\nconsisting of the size of user shared memory allocated (statically and dynamically) at launch time\n\nand the size of shared memory region which is reserved for the NVIDIA system software use.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.1.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nmov.u32  %r, %aggr_smem_size;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-aggr-smem-size"
            };

        case "alias":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-alias\" target=\"_blank\" rel=\"noopener noreferrer\">alias <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Kernel and Function Directives: .alias</h1><section id=\"kernel-and-function-directives-alias\">\n\n\n<p>Define an alias to existing function symbol.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.alias fAlias, fAliasee;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.alias</span></code> is a module scope directive that defines identifier <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlias</span></code> to be an alias to function\nspecified by <code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code>.</p>\n<p>Both <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlias</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code> are non-entry function symbols.</p>\n<p>Identifier <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlias</span></code> is a function declaration without body.</p>\n<p>Identifier <code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code> is a function symbol which must be defined in the same module as <code class=\"docutils literal notranslate\"><span class=\"pre\">.alias</span></code>\ndeclaration. 
Function <code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code> cannot have <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> linkage.</p>\n<p>Prototype of <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlias</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code> must match.</p>\n<p>Program can use either <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlias</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlisee</span></code> identifiers to reference function defined with\n<code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.alias</span></code> directive introduced in PTX ISA 6.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.alias</span></code> directive requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.visible .func foo(.param .u32 p) {\n   ...\n}\n.visible .func bar(.param .u32 p);\n.alias bar, foo;\n.entry test()\n{\n      .param .u32 p;\n      ...\n      call foo, (p);       // call foo directly\n       ...\n       .param .u32 p;\n       call bar, (p);        // call foo through alias\n}\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n{\n    .reg .b32 %r1, %r2, %r3;\n    ld.param.b32  %r1, [x];\n    ld.param.b32  %r2, [y];\n    ld.param.b32  %r3, [z];\n    ...\n}\n</pre></div>\n</div>\n</section>",
                "tooltip": "Define an alias to existing function symbol.\n\nSyntax\n\n.alias fAlias, fAliasee;\n\nDescription\n\n.alias is a module scope directive that defines identifier fAlias to be an alias to function\n\nspecified by fAliasee.\n\nBoth fAlias and fAliasee are non-entry function symbols.\n\nIdentifier fAlias is a function declaration without body.\n\nIdentifier fAliasee is a function symbol which must be defined in the same module as .alias\n\ndeclaration. Function fAliasee cannot have .weak linkage.\n\nPrototype of fAlias and fAliasee must match.\n\nProgram can use either fAlias or fAlisee identifiers to reference function defined with\n\nfAliasee.\n\nPTX ISA Notes\n\n.alias directive introduced in PTX ISA 6.3.\n\nTarget ISA Notes\n\n.alias directive requires sm_30 or higher.\n\nExamples\n\n.visible .func foo(.param .u32 p) {\n\n   ...\n\n}\n\n.visible .func bar(.param .u32 p);\n\n.alias bar, foo;\n\n.entry test()\n\n{\n\n      .param .u32 p;\n\n      ...\n\n      call foo, (p);       // call foo directly\n\n       ...\n\n       .param .u32 p;\n\n       call bar, (p);        // call foo through alias\n\n}\n\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n\n{\n\n    .reg .b32 %r1, %r2, %r3;\n\n    ld.param.b32  %r1, [x];\n\n    ld.param.b32  %r2, [y];\n\n    ld.param.b32  %r3, [z];\n\n    ...\n\n}\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-alias"
            };

        case "alloca":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca\" target=\"_blank\" rel=\"noopener noreferrer\">alloca <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Stack Manipulation Instructions: alloca</h1><section id=\"stack-manipulation-instructions-alloca\">\n\n\n<p>Dynamically allocate memory on stack.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>alloca.type  ptr, size{, immAlign};\n\n.type = { .u32, .u64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> instruction dynamically allocates memory on the stack frame of the current function\nand updates the stack pointer accordingly. The returned pointer <code class=\"docutils literal notranslate\"><span class=\"pre\">ptr</span></code> points to local memory and\ncan be used in the address operand of <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.local</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">st.local</span></code> instructions.</p>\n<p>If sufficient memory is unavailable for allocation on the stack, then execution of <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> may\nresult in stack overflow. 
In such cases, attempting to access the allocated memory with <code class=\"docutils literal notranslate\"><span class=\"pre\">ptr</span></code> will\nresult in undefined program behavior.</p>\n<p>The memory allocated by <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> is deallocated in the following ways:</p>\n<ul class=\"simple\">\n<li><p>It is automatically deallocated when the function exits.</p></li>\n<li><p>It can be explicitly deallocated using <code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> instructions:\n<code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code> can be used to save the value of stack pointer before executing <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code>, and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> can be used after <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> to restore stack pointer to the original value which\nwas previously saved with <code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code>. Note that accessing deallocated memory after executing\n<code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> results in undefined behavior.</p></li>\n</ul>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> is an unsigned value which specifies the amount of memory in number of bytes to be\nallocated on stack. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">size</span> <span class=\"pre\">=</span> <span class=\"pre\">0</span></code> may not lead to a valid memory allocation.</p>\n<p>Both <code class=\"docutils literal notranslate\"><span class=\"pre\">ptr</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> have the same type as the instruction type.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">immAlign</span></code> is a 32-bit value which specifies the alignment requirement in number of bytes for the\nmemory allocated by <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code>. It is an integer constant, must be a power of 2 and must not exceed\n2^23. <code class=\"docutils literal notranslate\"><span class=\"pre\">immAlign</span></code> is an optional argument with default value being 8 which is the minimum\nguaranteed alignment.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>alloca.type ptr, size, immAlign:\n\na = max(immAlign, frame_align); // frame_align is the minimum guaranteed alignment\n\n// Allocate size bytes of stack memory with alignment a and update the stack pointer.\n// Since the stack grows down, the updated stack pointer contains a lower address.\nstackptr = alloc_stack_mem(size, a);\n\n// Return the new value of stack pointer as ptr. Since ptr is the lowest address of the memory\n// allocated by alloca, the memory can be accessed using ptr up to (ptr + size of allocated memory).\nstacksave ptr;\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.3.</p>\n<dl class=\"simple\">\n<dt>Preview Feature:</dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> is a preview feature in PTX ISA version 7.3. 
All details are subject to change with no\nguarantees of backward compatibility on future PTX ISA versions or SM architectures.</p>\n</dd>\n</dl>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_52</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .u32 ra, stackptr, ptr, size;\n\nstacksave.u32 stackptr;     // Save the current stack pointer\nalloca ptr, size, 8;        // Allocate stack memory\nst.local.u32 [ptr], ra;     // Use the allocated stack memory\nstackrestore.u32 stackptr;  // Deallocate memory by restoring the stack pointer\n</pre></div>\n</div>\n</section>",
                "tooltip": "Dynamically allocate memory on stack.\n\nSyntax\n\nalloca.type  ptr, size{, immAlign};\n\n.type = { .u32, .u64 };\n\nDescription\n\nThe alloca instruction dynamically allocates memory on the stack frame of the current function\n\nand updates the stack pointer accordingly. The returned pointer ptr points to local memory and\n\ncan be used in the address operand of ld.local and st.local instructions.\n\nIf sufficient memory is unavailable for allocation on the stack, then execution of alloca may\n\nresult in stack overflow. In such cases, attempting to access the allocated memory with ptr will\n\nresult in undefined program behavior.\n\nThe memory allocated by alloca is deallocated in the following ways:\n\nIt is automatically deallocated when the function exits.\n\nIt can be explicitly deallocated using stacksave and stackrestore instructions:\n\nstacksave can be used to save the value of stack pointer before executing alloca, and\n\nstackrestore can be used after alloca to restore stack pointer to the original value which\n\nwas previously saved with stacksave. Note that accessing deallocated memory after executing\n\nstackrestore results in undefined behavior.\n\nsize is an unsigned value which specifies the amount of memory in number of bytes to be\n\nallocated on stack. size = 0 may not lead to a valid memory allocation.\n\nBoth ptr and size have the same type as the instruction type.\n\nimmAlign is a 32-bit value which specifies the alignment requirement in number of bytes for the\n\nmemory allocated by alloca. It is an integer constant, must be a power of 2 and must not exceed\n\n2^23. 
immAlign is an optional argument with default value being 8 which is the minimum\n\nguaranteed alignment.\n\nSemantics\n\nalloca.type ptr, size, immAlign:\n\na = max(immAlign, frame_align); // frame_align is the minimum guaranteed alignment\n\n// Allocate size bytes of stack memory with alignment a and update the stack pointer.\n\n// Since the stack grows down, the updated stack pointer contains a lower address.\n\nstackptr = alloc_stack_mem(size, a);\n\n// Return the new value of stack pointer as ptr. Since ptr is the lowest address of the memory\n\n// allocated by alloca, the memory can be accessed using ptr up to (ptr + size of allocated memory).\n\nstacksave ptr;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.3.\n\nPreview Feature:\n\nalloca is a preview feature in PTX ISA version 7.3. All details are subject to change with no\n\nguarantees of backward compatibility on future PTX ISA versions or SM architectures.\n\nTarget ISA Notes\n\nalloca requires sm_52 or higher.\n\nExamples\n\n.reg .u32 ra, stackptr, ptr, size;\n\nstacksave.u32 stackptr;     // Save the current stack pointer\n\nalloca ptr, size, 8;        // Allocate stack memory\n\nst.local.u32 [ptr], ra;     // Use the allocated stack memory\n\nstackrestore.u32 stackptr;  // Deallocate memory by restoring the stack pointer\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca"
            };

        case "and":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and\" target=\"_blank\" rel=\"noopener noreferrer\">and <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: and</h1><section id=\"logic-and-shift-instructions-and\">\n\n\n<p>Bitwise AND.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>and.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the bit-wise and operation for the bits in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a &amp; b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The size of the operands must match, but not necessarily the type.</p>\n<p>Allowed types include predicate registers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>and.b32  x,q,r;\nand.b32  sign,fpvalue,0x80000000;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bitwise AND.\n\nSyntax\n\nand.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nCompute the bit-wise and operation for the bits in a and b.\n\nSemantics\n\nd = a & b;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicate registers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nand.b32  x,q,r;\n\nand.b32  sign,fpvalue,0x80000000;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and"
            };

        case "applypriority":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority\" target=\"_blank\" rel=\"noopener noreferrer\">applypriority <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: applypriority</h1><section id=\"data-movement-and-conversion-instructions-applypriority\">\n\n\n<p>Apply the cache eviction priority to the specified address in the specified cache level.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>applypriority{.global}.level::eviction_priority  [a], size;\n\n.level::eviction_priority = { .L2::evict_normal };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">applypriority</span></code> instruction applies the cache eviction priority specified by the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier to the address range <code class=\"docutils literal notranslate\"><span class=\"pre\">[a..a+size)</span></code> in the specified cache\nlevel.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the specified address does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space\nthen the behavior is undefined.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> is an integer constant that specifies the amount of data, in bytes, in the\nspecified cache level on which the priority is to be applied. 
The only supported value for the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> operand is 128.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> must be aligned to 128 bytes.</p>\n<p>If the data pointed to by address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is not already present in the specified cache level, then\nthe data will be prefetched before applying the specified priority.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>applypriority.global.L2::evict_normal [ptr], 128;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Apply the cache eviction priority to the specified address in the specified cache level.\n\nSyntax\n\napplypriority{.global}.level::eviction_priority  [a], size;\n\n.level::eviction_priority = { .L2::evict_normal };\n\nDescription\n\nThe applypriority instruction applies the cache eviction priority specified by the\n\n.level::eviction_priority qualifier to the address range [a..a+size) in the specified cache\n\nlevel.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the specified address does not fall within the address window of .global state space\n\nthen the behavior is undefined.\n\nThe operand size is an integer constant that specifies the amount of data, in bytes, in the\n\nspecified cache level on which the priority is to be applied. The only supported value for the\n\nsize operand is 128.\n\nSupported addressing modes for operand a are described in Addresses as Operands. a must be aligned to 128 bytes.\n\nIf the data pointed to by address a is not already present in the specified cache level, then\n\nthe data will be prefetched before applying the specified priority.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.4.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nExamples\n\napplypriority.global.L2::evict_normal [ptr], 128;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority"
            };

        case "atom":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom\" target=\"_blank\" rel=\"noopener noreferrer\">atom <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: atom</h1><section id=\"parallel-synchronization-and-communication-instructions-atom\">\n\n\n<p>Atomic reduction operations for thread-to-thread communication.</p>\n<p><strong>Syntax</strong></p>\n<p>Atomic operation with scalar type:</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>atom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache-policy};\natom{.sem}{.scope}{.space}.op.type d, [a], b, c;\n\natom{.sem}{.scope}{.space}.cas.b16 d, [a], b, c;\n\natom{.sem}{.scope}{.space}.cas.b128 d, [a], b, c {, cache-policy};\natom{.sem}{.scope}{.space}.exch{.level::cache_hint}.b128 d, [a], b {, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16     d, [a], b{, cache-policy};\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16x2   d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16    d, [a], b{, cache-policy};\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16x2  d, [a], b{, cache-policy};\n\n.space =              { .global, .shared{::cta, ::cluster} };\n.sem =                { .relaxed, .acquire, .release, .acq_rel };\n.scope =              { .cta, .cluster, .gpu, .sys };\n\n.op =                 { .and, .or, .xor,\n                        .cas, .exch,\n                        .add, .inc, .dec,\n                        .min, .max };\n.level::cache_hint =  { .L2::cache_hint };\n.type =               { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64 };\n</pre></div>\n</div>\n<p>Atomic operation with vector 
type:</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>atom{.sem}{.scope}{.global}.add{.level::cache_hint}.vec_32_bit.f32                  d, [a], b{, cache-policy};\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_16_bit.half_word_type  d, [a], b{, cache-policy};\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_32_bit.packed_type     d, [a], b{, cache-policy};\n\n.sem =               { .relaxed, .acquire, .release, .acq_rel };\n.scope =             { .cta, .cluster, .gpu, .sys };\n.op =                { .add, .min, .max };\n.half_word_type =    { .f16, .bf16 };\n.packed_type =       { .f16x2, .bf16x2 };\n.vec_16_bit =        { .v2, .v4, .v8 }\n.vec_32_bit =        { .v2, .v4 };\n.level::cache_hint = { .L2::cache_hint }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Atomically loads the original value at location <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> into destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, performs a\nreduction operation with operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and the value in location <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, and stores the result of the\nspecified operation at location <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, overwriting the original value. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> specifies a\nlocation in the specified state space. If no state space is given, perform the memory accesses using\n<a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> with scalar type may be used only\nwith <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> spaces and with generic addressing, where the address points to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> space. <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> with vector type may be used only with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space\nand with generic addressing where the address points to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> with vector type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are brace-enclosed vector expressions, size\nof which is equal to the size of vector qualifier.</p>\n<p>If no sub-qualifier is specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, then <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier specifies a memory synchronizing effect as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory\nConsistency Model</a>. 
If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is absent, <code class=\"docutils literal notranslate\"><span class=\"pre\">.gpu</span></code> scope is\nassumed by default.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> with vector type, the supported combinations of vector qualifier and types, and atomic\noperations supported on these combinations are depicted in the following table:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 19%\"/>\n<col style=\"width: 32%\"/>\n<col style=\"width: 32%\"/>\n<col style=\"width: 16%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\" rowspan=\"2\"><p>Vector qualifier</p></th>\n<th class=\"head\" colspan=\"3\"><p>Types</p></th>\n</tr>\n<tr class=\"row-even\">\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>/ <code class=\"docutils literal notranslate\"><span class=\"pre\">bf16</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>/ <code class=\"docutils literal notranslate\"><span class=\"pre\">bf16x2</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code></p></th>\n</tr>\n</thead>\n<tbody>\n<tr 
class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.v2</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.v4</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.v8</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p>Not supported</p></td>\n<td><p>Not 
Supported</p></td>\n</tr>\n</tbody>\n</table>\n<p>Two atomic operations {<code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code>} are performed atomically with respect to each other only\nif each operation specifies a scope that includes the other. When this condition is not met, each\noperation observes the other operation being performed as if it were split into a read followed by a\ndependent write.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> instruction on packed type or vector type, accesses adjacent scalar elements in memory. In\nsuch cases, the atomicity is guaranteed separately for each of the individual scalar elements; the\nentire <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> is not guaranteed to be atomic as a single access.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code> and earlier architectures, <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> operations on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space do not\nguarantee atomicity with respect to normal store instructions to the same address. 
It is the\nprogrammer\u2019s responsibility to guarantee correctness of programs that use shared memory atomic\ninstructions, e.g., by inserting barriers between normal stores and atomic operations to a common\naddress, or by using atom.exch to store to locations accessed by other atomic operations.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p>The bit-size operations are <code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.cas</span></code> (compare-and-swap), and <code class=\"docutils literal notranslate\"><span class=\"pre\">.exch</span></code>\n(exchange).</p>\n<p>The integer operations are <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code>. The <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code> operations return a result in the range <code class=\"docutils literal notranslate\"><span class=\"pre\">[0..b]</span></code>.</p>\n<p>The floating-point operation <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code> operation rounds to nearest even. 
Current implementation of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f32</span></code> on global memory flushes subnormal inputs and results to sign-preserving zero;\nwhereas <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f32</span></code> on shared memory supports subnormal inputs and results and doesn\u2019t flush\nthem to zero.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.bf16x2</span></code> operation requires\nthe <code class=\"docutils literal notranslate\"><span class=\"pre\">.noftz</span></code> qualifier; it preserves subnormal inputs and results, and does not flush them to\nzero.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. 
The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space and for generic\naddressing where the address points to the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>atomic {\n    d = *a;\n    *a = (operation == cas) ? operation(*a, b, c)\n                            : operation(*a, b);\n}\nwhere\n    inc(r, s)  = (r &gt;= s) ? 0 : r+1;\n    dec(r, s)  = (r==0 || r &gt; s)  ? s : r-1;\n    exch(r, s) =  s;\n    cas(r,s,t) = (r == s) ? 
t : r;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Simple reductions may be specified by using the <em>bit bucket</em> destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">_</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>32-bit atom.global introduced in PTX ISA version 1.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.shared</span></code> and 64-bit<code class=\"docutils literal notranslate\"><span class=\"pre\">atom.global.{add,cas,exch}</span></code> introduced in PTX ISA 1.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f32</span></code> and 64-bit<code class=\"docutils literal notranslate\"><span class=\"pre\">atom.shared.{add,cas,exch}</span></code> introduced in PTX ISA 2.0.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.{and,or,xor,min,max}</span></code> introduced in PTX ISA 3.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f64</span></code> introduced in PTX ISA 5.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier introduced in PTX ISA 5.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.f16x2</span></code> introduced in PTX ISA 6.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.f16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.cas.b16</span></code> introduced in PTX ISA 6.3.</p>\n<p>Per-element atomicity of <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.f16x2</span></code> clarified in PTX ISA version 6.3, with retrospective effect\nfrom PTX ISA version 6.2.</p>\n<p>Support for <code class=\"docutils literal 
notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier introduced in PTX ISA version 7.4.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.bf16x2</span></code> introduced in PTX ISA 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifiers introduced in PTX ISA version 7.8.</p>\n<p>Support for vector types introduced in PTX ISA version 8.1.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type introduced in PTX ISA version 8.3.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.sys</span></code> scope with <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type introduced in PTX ISA version 8.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.global</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_11</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.shared</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_12</span></code> or higher.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.global.{add,cas,exch}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_12</span></code> or higher.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span 
class=\"pre\">atom.shared.{add,cas,exch}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.{and,or,xor,min,max}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_32</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f32</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Use of generic addressing requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.f16x2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.f16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.cas.b16</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier 
requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.bf16x2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for vector types requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>atom.global.add.s32  d,[a],1;\natom.shared::cta.max.u32  d,[x+4],0;\n@p  atom.global.cas.b32  d,[p],my_val,my_new_val;\natom.global.sys.add.u32 d, [a], 1;\natom.global.acquire.sys.inc.u32 ans, [gbl], %r0;\natom.add.noftz.f16x2 d, [a], b;\natom.add.noftz.f16   hd, [ha], hb;\natom.global.cas.b16  hd, [ha], hb, hc;\natom.add.noftz.bf16   hd, [a], hb;\natom.add.noftz.bf16x2 bd, [b], bb;\natom.add.shared::cluster.noftz.f16   hd, [ha], 
hb;\natom.shared.b128.cas d, a, b, c; // 128-bit atom\natom.global.b128.exch d, a, b;   // 128-bit atom\n\natom.global.cluster.relaxed.add.u32 d, [a], 1;\n\ncreatepolicy.fractional.L2::evict_last.b64 cache-policy, 0.25;\natom.global.add.L2::cache_hint.s32  d, [a], 1, cache-policy;\n\natom.global.v8.f16.max.noftz  {%hd0, %hd1, %hd2, %hd3, %hd4, %hd5, %hd6, %hd7}, [gbl],\n                                              {%h0, %h1, %h2, %h3, %h4, %h5, %h6, %h7};\natom.global.v8.bf16.add.noftz  {%hd0, %hd1, %hd2, %hd3, %hd4, %hd5, %hd6, %hd7}, [gbl],\n                                              {%h0, %h1, %h2, %h3, %h4, %h5, %h6, %h7};\natom.global.v2.f16.add.noftz  {%hd0, %hd1}, [gbl], {%h0, %h1};\natom.global.v2.bf16.add.noftz  {%hd0, %hd1}, [gbl], {%h0, %h1};\natom.global.v4.b16x2.min.noftz  {%hd0, %hd1, %hd2, %hd3}, [gbl], {%h0, %h1, %h2, %h3};\natom.global.v4.f32.add  {%f0, %f1, %f2, %f3}, [gbl], {%f0, %f1, %f2, %f3};\natom.global.v2.f16x2.min.noftz  {%bd0, %bd1}, [g], {%b0, %b1};\natom.global.v2.bf16x2.max.noftz  {%bd0, %bd1}, [g], {%b0, %b1};\natom.global.v2.f32.add  {%f0, %f1}, [g], {%f0, %f1};\n</pre></div>\n</div>\n</section>",
                "tooltip": "Atomic reduction operations for thread-to-thread communication.\n\nSyntax\n\nAtomic operation with scalar type:\n\natom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.op.type d, [a], b, c;\n\natom{.sem}{.scope}{.space}.cas.b16 d, [a], b, c;\n\natom{.sem}{.scope}{.space}.cas.b128 d, [a], b, c {, cache-policy};\n\natom{.sem}{.scope}{.space}.exch{.level::cache_hint}.b128 d, [a], b {, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16     d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16x2   d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16    d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16x2  d, [a], b{, cache-policy};\n\n.space =              { .global, .shared{::cta, ::cluster} };\n\n.sem =                { .relaxed, .acquire, .release, .acq_rel };\n\n.scope =              { .cta, .cluster, .gpu, .sys };\n\n.op =                 { .and, .or, .xor,\n\n                        .cas, .exch,\n\n                        .add, .inc, .dec,\n\n                        .min, .max };\n\n.level::cache_hint =  { .L2::cache_hint };\n\n.type =               { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64 };\n\nAtomic operation with vector type:\n\natom{.sem}{.scope}{.global}.add{.level::cache_hint}.vec_32_bit.f32                  d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_16_bit.half_word_type  d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_32_bit.packed_type     d, [a], b{, cache-policy};\n\n.sem =               { .relaxed, .acquire, .release, .acq_rel };\n\n.scope =             { .cta, .cluster, .gpu, .sys };\n\n.op =                { .add, .min, .max };\n\n.half_word_type =    { .f16, .bf16 };\n\n.packed_type =       { .f16x2, .bf16x2 
};\n\n.vec_16_bit =        { .v2, .v4, .v8 }\n\n.vec_32_bit =        { .v2, .v4 };\n\n.level::cache_hint = { .L2::cache_hint }\n\nDescription\n\nAtomically loads the original value at location a into destination register d, performs a\n\nreduction operation with operand b and the value in location a, and stores the result of the\n\nspecified operation at location a, overwriting the original value. Operand a specifies a\n\nlocation in the specified state space. If no state space is given, perform the memory accesses using\n\nGeneric Addressing. atom with scalar type may be used only\n\nwith .global and .shared spaces and with generic addressing, where the address points to\n\n.global or .shared space. atom with vector type may be used only with .global space\n\nand with generic addressing where the address points to .global space.\n\nFor atom with vector type, operands d and b are brace-enclosed vector expressions, size\n\nof which is equal to the size of vector qualifier.\n\nIf no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.\n\nThe optional .sem qualifier specifies a memory synchronizing effect as described in the Memory\n\nConsistency Model. If the .sem qualifier is absent,\n\n.relaxed is assumed by default.\n\nThe optional .scope qualifier specifies the set of threads that can directly observe the memory\n\nsynchronizing effect of this operation, as described in the Memory Consistency Model. 
If the .scope qualifier is absent, .gpu scope is\n\nassumed by default.\n\nFor atom with vector type, the supported combinations of vector qualifier and types, and atomic\n\noperations supported on these combinations are depicted in the following table:\n\n\n\n\n\nVector qualifier\n\nTypes\n\n.f16/ bf16\n\n.f16x2/ bf16x2\n\n.f32\n\n\n\n.v2\n\n.add, .min, .max\n\n.add, .min, .max\n\n.add\n\n.v4\n\n.add, .min, .max\n\n.add, .min, .max\n\n.add\n\n.v8\n\n.add, .min, .max\n\nNot supported\n\nNot Supported\n\nTwo atomic operations {atom or red} are performed atomically with respect to each other only\n\nif each operation specifies a scope that includes the other. When this condition is not met, each\n\noperation observes the other operation being performed as if it were split into a read followed by a\n\ndep ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom"
            };

        case "bar":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier\" target=\"_blank\" rel=\"noopener noreferrer\">bar <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync\" target=\"_blank\" rel=\"noopener noreferrer\">bar.warp.sync <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: bar, barrier</h1><section id=\"parallel-synchronization-and-communication-instructions-bar-barrier\">\n<span id=\"parallel-synchronization-and-communication-instructions-bar\"></span>\n\n<p>Barrier synchronization.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>barrier{.cta}.sync{.aligned}      a{, b};\nbarrier{.cta}.arrive{.aligned}    a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;\nbarrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;\n\nbar{.cta}.sync      a{, b};\nbar{.cta}.arrive    a, b;\n\nbar{.cta}.red.popc.u32  d, a{, b}, {!}c;\nbar{.cta}.red.op.pred   p, a{, b}, {!}c;\n\n.op = { .and, .or };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs barrier synchronization and communication within a CTA. 
Each CTA instance has sixteen\nbarriers numbered <code class=\"docutils literal notranslate\"><span class=\"pre\">0..15</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions can be used by the threads within the CTA for synchronization and\ncommunication.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>; operands <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are predicates. Source\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> specifies a logical barrier resource as an immediate constant or register with value\n<code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> through <code class=\"docutils literal notranslate\"><span class=\"pre\">15</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> specifies the number of threads participating in the barrier. If\nno thread count is specified, all threads in the CTA participate in the barrier. When specifying a\nthread count, the value must be a multiple of the warp size. 
Note that a non-zero thread count is\nrequired for <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>.</p>\n<p>Depending on operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, either specified number of threads (in multiple of warp size) or all\nthreads in the CTA participate in <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction. The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions\nsignal the arrival of the executing threads at the named barrier.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction causes executing thread to wait for all non-exited threads from its\nwarp and marks warps\u2019 arrival at barrier. In addition to signaling its arrival at the barrier, the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> instructions causes executing thread to wait for\nnon-exited threads of all other warps participating in the barrier to\narrive. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> does not cause executing thread to wait for threads of other\nparticipating warps.</p>\n<p>When a barrier completes, the waiting threads are restarted without delay, and the barrier is\nreinitialized so that it can be immediately reused.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> instruction\nguarantees that when the barrier completes, prior memory accesses requested by this thread are\nperformed relative to all threads participating in the barrier. The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> instruction further guarantees that no new memory access is requested by this\nthread before the barrier completes.</p>\n<p>A memory read (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value read has been\ntransmitted from memory and cannot be modified by another thread participating in the barrier. 
A\nmemory write (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value written has\nbecome visible to other threads participating in the barrier, that is, when the previous value can\nno longer be read.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> performs a reduction operation across threads. The <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> predicate (or its\ncomplement) from all threads in the CTA are combined using the specified reduction operator. Once\nthe barrier count is reached, the final value is written to the destination register in all threads\nwaiting at the barrier.</p>\n<p>The reduction operations for <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> are population-count (<code class=\"docutils literal notranslate\"><span class=\"pre\">.popc</span></code>),\nall-threads-True (<code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>), and any-thread-True (<code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>). 
The result of <code class=\"docutils literal notranslate\"><span class=\"pre\">.popc</span></code> is the number of\nthreads with a <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate, while <code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code> indicate if all the threads had a\n<code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate or if any of the threads had a <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> has optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> modifier. When specified, it indicates that\nall threads in CTA will execute the same <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction. In conditionally executed\ncode, an aligned <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction should only be used if it is known that all threads\nin CTA evaluate the condition identically, otherwise behavior is undefined.</p>\n<p>Different warps may execute different forms of the <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction using the same\nbarrier name and thread count. One example mixes <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>\nto implement producer/consumer models. 
The producer threads execute <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> to\nannounce their arrival at the barrier and continue execution without delay to produce the next\nvalue, while the consumer threads execute the <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> to wait for a resource to be\nproduced. The roles are then reversed, using a different barrier, where the producer threads execute\na <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> to wait for a resource to consumed, while the consumer threads announce\nthat the resource has been consumed with <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>. Care must be taken to keep a warp\nfrom executing more <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions than intended (<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> followed\nby any other <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction to the same barrier) prior to the reset of the\nbarrier. <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> should not be intermixed with <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> using the same active barrier. 
Execution in this case is unpredictable.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code> qualifier simply indicates CTA-level applicability of the barrier and it\ndoesn\u2019t change the semantics of the instruction.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.sync</span></code> is equivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync.aligned</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.arrive</span></code> is\nequivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive.aligned</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.red</span></code> is equivalent to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red.aligned</span></code>.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>For .target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code> or below,</p>\n<ol class=\"arabic simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction without <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> modifier is equivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code>\nvariant and has the same restrictions as of <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> variant.</p></li>\n<li><p>All threads in warp (except for those have exited) must execute <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction\nin convergence.</p></li>\n</ol>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bar.sync</span></code> without a 
thread count introduced in PTX ISA version 1.0.</p>\n<p>Register operands, thread count, and <code class=\"docutils literal notranslate\"><span class=\"pre\">bar.{arrive,red}</span></code> introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier</span></code> instruction introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code> qualifier introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Register operands, thread count, and <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.{arrive,red}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Only <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.sync</span></code> with an immediate barrier number is supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code> targets.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for all threads in CTA to also arrive:\n    st.shared [r0],r1;  // write my result to shared memory\n    bar.cta.sync  1;    // arrive, wait for others to arrive\n    ld.shared r2,[r3];  // use shared results from other threads\n\n// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for fixed number of cooperating threads to arrive:\n    #define CNT1 (8*12) // Number of cooperating threads\n\n    st.shared [r0],r1;     // write my result to shared memory\n    bar.cta.sync  1, CNT1; // arrive, wait for others to 
arrive\n    ld.shared r2,[r3];     // use shared results from other threads\n\n// Use bar.red.and to compare results across the entire CTA:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.and.pred r3,1,p; // r3=AND(p) forall threads in CTA\n\n// Use bar.red.popc to compute the size of a group of threads\n// that have a specific condition True:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.popc.u32 r3,1,p; // r3=SUM(p) forall threads in CTA\n\n/* Producer/consumer model. The producer deposits a value in\n * shared memory, signals that it is complete but does not wait\n * using bar.arrive, and begins fetching more data from memory.\n * Once the data returns from memory, the producer must wait\n * until the consumer signals that it has read the value from\n * the shared memory location. In the meantime, a consumer\n * thread waits until the data is stored by the producer, reads\n * it, and then signals that it is done (without waiting).\n */\n    // Producer code places produced value in shared memory.\n    st.shared   [r0],r1;\n    bar.arrive  0,64;\n    ld.global   r1,[r2];\n    bar.sync    1,64;\n    ...\n\n    // Consumer code, reads value from shared memory\n    bar.sync   0,64;\n    ld.shared  r1,[r0];\n    bar.arrive 1,64;\n    ...\n\n    // Examples of barrier.cta.sync\n    st.shared         [r0],r1;\n    barrier.cta.sync  0;\n    ld.shared         r1, [r0];\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: bar.warp.sync</h1><section id=\"parallel-synchronization-and-communication-instructions-bar-warp-sync\">\n\n\n<p>Barrier synchronization for threads in a warp.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>bar.warp.sync      membermask;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">bar.warp.sync</span></code> will cause executing thread to wait until all threads corresponding to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> have executed a <code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code> with the same <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> value before resuming\nexecution.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> specifies a 32-bit integer which is a mask indicating threads participating\nin barrier where the bit position corresponds to thread\u2019s <code class=\"docutils literal notranslate\"><span class=\"pre\">laneid</span></code>.</p>\n<p>The behavior of <code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code> is undefined if the executing thread is not in the <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code> also guarantee memory ordering among threads participating in barrier. 
Thus,\nthreads within warp that wish to communicate via memory can store to memory, execute\n<code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code>, and then safely read values stored by other threads in warp.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>For .target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code> or below, all threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> must execute the same\n<code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code> instruction in convergence, and only threads belonging to some <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>\ncan be active when the <code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code> instruction is executed. Otherwise, the behavior is\nundefined.</p>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>st.shared.u32 [r0],r1;         // write my result to shared memory\nbar.warp.sync  0xffffffff;     // arrive, wait for others to arrive\nld.shared.u32 r2,[r3];         // read results written by other threads\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Parallel Synchronization and Communication Instructions: bar, barrier\n\n\n\nBarrier synchronization.\n\nSyntax\n\nbarrier{.cta}.sync{.aligned}      a{, b};\n\nbarrier{.cta}.arrive{.aligned}    a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;\n\nbarrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;\n\nbar{.cta}.sync      a{, b};\n\nbar{.cta}.arrive    a, b;\n\nbar{.cta}.red.popc.u32  d, a{, b}, {!}c;\n\nbar{.cta}.red.op.pred   p, a{, b}, {!}c;\n\n.op = { .and, .or };\n\nDescription\n\nPerform...\n\n=====Parallel Synchronization and Communication Instructions: bar.warp.sync\n\n\n\nBarrier synchronization for threads in a warp.\n\nSyntax\n\nbar.warp.sync      membermask;\n\nDescription\n\nbar.warp.sync will cause executing thread to wait until all threads corresponding to\n\nmembermask have executed a bar.warp.sync with the same membermask value before resuming\n\nexecution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin barrier where the bit... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier"
            };

        case "barrier":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier\" target=\"_blank\" rel=\"noopener noreferrer\">barrier <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster\" target=\"_blank\" rel=\"noopener noreferrer\">barrier.cluster <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: bar, barrier</h1><section id=\"parallel-synchronization-and-communication-instructions-bar-barrier\">\n<span id=\"parallel-synchronization-and-communication-instructions-bar\"></span>\n\n<p>Barrier synchronization.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>barrier{.cta}.sync{.aligned}      a{, b};\nbarrier{.cta}.arrive{.aligned}    a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;\nbarrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;\n\nbar{.cta}.sync      a{, b};\nbar{.cta}.arrive    a, b;\n\nbar{.cta}.red.popc.u32  d, a{, b}, {!}c;\nbar{.cta}.red.op.pred   p, a{, b}, {!}c;\n\n.op = { .and, .or };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs barrier synchronization and communication within a CTA. 
Each CTA instance has sixteen\nbarriers numbered <code class=\"docutils literal notranslate\"><span class=\"pre\">0..15</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions can be used by the threads within the CTA for synchronization and\ncommunication.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>; operands <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are predicates. Source\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> specifies a logical barrier resource as an immediate constant or register with value\n<code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> through <code class=\"docutils literal notranslate\"><span class=\"pre\">15</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> specifies the number of threads participating in the barrier. If\nno thread count is specified, all threads in the CTA participate in the barrier. When specifying a\nthread count, the value must be a multiple of the warp size. 
Note that a non-zero thread count is\nrequired for <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>.</p>\n<p>Depending on operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, either specified number of threads (in multiple of warp size) or all\nthreads in the CTA participate in <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction. The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions\nsignal the arrival of the executing threads at the named barrier.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction causes executing thread to wait for all non-exited threads from its\nwarp and marks warps\u2019 arrival at barrier. In addition to signaling its arrival at the barrier, the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> instructions causes executing thread to wait for\nnon-exited threads of all other warps participating in the barrier to\narrive. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> does not cause executing thread to wait for threads of other\nparticipating warps.</p>\n<p>When a barrier completes, the waiting threads are restarted without delay, and the barrier is\nreinitialized so that it can be immediately reused.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> instruction\nguarantees that when the barrier completes, prior memory accesses requested by this thread are\nperformed relative to all threads participating in the barrier. The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> instruction further guarantees that no new memory access is requested by this\nthread before the barrier completes.</p>\n<p>A memory read (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value read has been\ntransmitted from memory and cannot be modified by another thread participating in the barrier. 
A\nmemory write (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value written has\nbecome visible to other threads participating in the barrier, that is, when the previous value can\nno longer be read.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> performs a reduction operation across threads. The <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> predicate (or its\ncomplement) from all threads in the CTA are combined using the specified reduction operator. Once\nthe barrier count is reached, the final value is written to the destination register in all threads\nwaiting at the barrier.</p>\n<p>The reduction operations for <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> are population-count (<code class=\"docutils literal notranslate\"><span class=\"pre\">.popc</span></code>),\nall-threads-True (<code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>), and any-thread-True (<code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>). 
The result of <code class=\"docutils literal notranslate\"><span class=\"pre\">.popc</span></code> is the number of\nthreads with a <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate, while <code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code> indicate if all the threads had a\n<code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate or if any of the threads had a <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> has optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> modifier. When specified, it indicates that\nall threads in CTA will execute the same <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction. In conditionally executed\ncode, an aligned <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction should only be used if it is known that all threads\nin CTA evaluate the condition identically, otherwise behavior is undefined.</p>\n<p>Different warps may execute different forms of the <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction using the same\nbarrier name and thread count. One example mixes <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>\nto implement producer/consumer models. 
The producer threads execute <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> to\nannounce their arrival at the barrier and continue execution without delay to produce the next\nvalue, while the consumer threads execute the <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> to wait for a resource to be\nproduced. The roles are then reversed, using a different barrier, where the producer threads execute\na <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> to wait for a resource to consumed, while the consumer threads announce\nthat the resource has been consumed with <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>. Care must be taken to keep a warp\nfrom executing more <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions than intended (<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> followed\nby any other <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction to the same barrier) prior to the reset of the\nbarrier. <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> should not be intermixed with <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> using the same active barrier. 
Execution in this case is unpredictable.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code> qualifier simply indicates CTA-level applicability of the barrier and it\ndoesn\u2019t change the semantics of the instruction.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.sync</span></code> is equivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync.aligned</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.arrive</span></code> is\nequivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive.aligned</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.red</span></code> is equivalent to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red.aligned</span></code>.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>For .target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code> or below,</p>\n<ol class=\"arabic simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction without <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> modifier is equivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code>\nvariant and has the same restrictions as of <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> variant.</p></li>\n<li><p>All threads in warp (except for those have exited) must execute <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction\nin convergence.</p></li>\n</ol>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bar.sync</span></code> without a 
thread count introduced in PTX ISA version 1.0.</p>\n<p>Register operands, thread count, and <code class=\"docutils literal notranslate\"><span class=\"pre\">bar.{arrive,red}</span></code> introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier</span></code> instruction introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code> qualifier introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Register operands, thread count, and <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.{arrive,red}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Only <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.sync</span></code> with an immediate barrier number is supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code> targets.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for all threads in CTA to also arrive:\n    st.shared [r0],r1;  // write my result to shared memory\n    bar.cta.sync  1;    // arrive, wait for others to arrive\n    ld.shared r2,[r3];  // use shared results from other threads\n\n// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for fixed number of cooperating threads to arrive:\n    #define CNT1 (8*12) // Number of cooperating threads\n\n    st.shared [r0],r1;     // write my result to shared memory\n    bar.cta.sync  1, CNT1; // arrive, wait for others to 
arrive\n    ld.shared r2,[r3];     // use shared results from other threads\n\n// Use bar.red.and to compare results across the entire CTA:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.and.pred r3,1,p; // r3=AND(p) forall threads in CTA\n\n// Use bar.red.popc to compute the size of a group of threads\n// that have a specific condition True:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.popc.u32 r3,1,p; // r3=SUM(p) forall threads in CTA\n\n/* Producer/consumer model. The producer deposits a value in\n * shared memory, signals that it is complete but does not wait\n * using bar.arrive, and begins fetching more data from memory.\n * Once the data returns from memory, the producer must wait\n * until the consumer signals that it has read the value from\n * the shared memory location. In the meantime, a consumer\n * thread waits until the data is stored by the producer, reads\n * it, and then signals that it is done (without waiting).\n */\n    // Producer code places produced value in shared memory.\n    st.shared   [r0],r1;\n    bar.arrive  0,64;\n    ld.global   r1,[r2];\n    bar.sync    1,64;\n    ...\n\n    // Consumer code, reads value from shared memory\n    bar.sync   0,64;\n    ld.shared  r1,[r0];\n    bar.arrive 1,64;\n    ...\n\n    // Examples of barrier.cta.sync\n    st.shared         [r0],r1;\n    barrier.cta.sync  0;\n    ld.shared         r1, [r0];\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: barrier.cluster</h1><section id=\"parallel-synchronization-and-communication-instructions-barrier-cluster\">\n\n\n<p>Barrier synchronization within a cluster.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>barrier.cluster.arrive{.sem}{.aligned};\nbarrier.cluster.wait{.acquire}{.aligned};\n\n.sem = {.release, 
.relaxed}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs barrier synchronization and communication within a cluster.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster</span></code> instructions can be used by the threads within the cluster for synchronization\nand communication.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> instruction marks warps\u2019 arrival at barrier without causing executing\nthread to wait for threads of other participating warps.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code> instruction causes the executing thread to wait for all non-exited threads\nof the cluster to perform <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code>.</p>\n<p>In addition, <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster</span></code> instructions cause the executing thread to wait for all non-exited\nthreads from its warp.</p>\n<p>When all non-exited threads that executed <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> have executed\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code>, the barrier completes and is reinitialized so it can be reused\nimmediately. 
Each thread must arrive at the barrier only once before the barrier completes.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code> instruction guarantees that when it completes the execution, memory\naccesses (except asynchronous operations) requested, in program order, prior to the preceding\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> by all threads in the cluster are complete and visible to the executing\nthread.</p>\n<p>There is no memory ordering and visibility guarantee for memory accesses requested by the executing\nthread, in program order, after <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> and prior to <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> qualifier on <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> specifies that there are no memory\nordering and visibility guarantees provided for the memory accesses performed prior to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> qualifiers on instructions <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code> specify the memory synchronization as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency\nModel</a>. 
If the optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent for\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> is assumed by default. If the optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code>\nqualifier is absent for <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> qualifier indicates that all threads in the warp must execute the same\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster</span></code> instruction. In conditionally executed code, an aligned <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster</span></code>\ninstruction should only be used if it is known that all threads in the warp evaluate the condition\nidentically, otherwise behavior is undefined.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> qualifiers introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// use of arrive followed by 
wait\nld.shared::cluster.u32 r0, [addr];\nbarrier.cluster.arrive.aligned;\n...\nbarrier.cluster.wait.aligned;\nst.shared::cluster.u32 [addr], r1;\n\n// use memory fence prior to arrive for relaxed barrier\n@cta0 ld.shared::cluster.u32 r0, [addr];\nfence.cluster.acq_rel;\nbarrier.cluster.arrive.relaxed.aligned;\n...\nbarrier.cluster.wait.aligned;\n@cta1 st.shared::cluster.u32 [addr], r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Parallel Synchronization and Communication Instructions: bar, barrier\n\n\n\nBarrier synchronization.\n\nSyntax\n\nbarrier{.cta}.sync{.aligned}      a{, b};\n\nbarrier{.cta}.arrive{.aligned}    a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;\n\nbarrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;\n\nbar{.cta}.sync      a{, b};\n\nbar{.cta}.arrive    a, b;\n\nbar{.cta}.red.popc.u32  d, a{, b}, {!}c;\n\nbar{.cta}.red.op.pred   p, a{, b}, {!}c;\n\n.op = { .and, .or };\n\nDescription\n\nPerform...\n\n=====Parallel Synchronization and Communication Instructions: barrier.cluster\n\n\n\nBarrier synchronization within a cluster.\n\nSyntax\n\nbarrier.cluster.arrive{.sem}{.aligned};\n\nbarrier.cluster.wait{.acquire}{.aligned};\n\n.sem = {.release, .relaxed}\n\nDescription\n\nPerforms barrier synchronization and communication within a cluster.\n\nbarrier.cluster instructions can be used by the threads within the cluster for synchronization\n\nand communication.\n\nbarrier.cluster.arrive instruction marks warps... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier"
            };

        case "bfe":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe\" target=\"_blank\" rel=\"noopener noreferrer\">bfe(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: bfe</h1><section id=\"integer-arithmetic-instructions-bfe\">\n\n\n<p>Bit Field Extract.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>bfe.type  d, a, b, c;\n\n.type = { .u32, .u64,\n          .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Extract bit field from <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and place the zero or sign-extended result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Source <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> gives\nthe bit field starting bit position, and source <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> gives the bit field length in bits.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> have the same type as the instruction type. 
Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are\ntype <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, but are restricted to the 8-bit value range <code class=\"docutils literal notranslate\"><span class=\"pre\">0..255</span></code>.</p>\n<p>The sign bit of the extracted field is defined as:</p>\n<dl class=\"simple\">\n<dt>\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>:</dt>\n<dd>\n<p>zero</p>\n</dd>\n<dt>\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>:</dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">msb</span></code> of input a if the extracted field extends beyond the <code class=\"docutils literal notranslate\"><span class=\"pre\">msb</span></code> of a <code class=\"docutils literal notranslate\"><span class=\"pre\">msb</span></code> of extracted\nfield, otherwise</p>\n</dd>\n</dl>\n<p>If the bit field length is zero, the result is zero.</p>\n<p>The destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is padded with the sign bit of the extracted field. If the start position is\nbeyond the <code class=\"docutils literal notranslate\"><span class=\"pre\">msb</span></code> of the input, the destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is filled with the replicated sign bit of the\nextracted field.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>msb = (.type==.u32 || .type==.s32) ? 
31 : 63;\npos = b &amp; 0xff;  // pos restricted to 0..255 range\nlen = c &amp; 0xff;  // len restricted to 0..255 range\n\nif (.type==.u32 || .type==.u64 || len==0)\n    sbit = 0;\nelse\n    sbit = a[min(pos+len-1,msb)];\n\nd = 0;\nfor (i=0; i&lt;=msb; i++) {\n    d[i] = (i&lt;len &amp;&amp; pos+i&lt;=msb) ? a[pos+i] : sbit;\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bfe</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>bfe.b32  d,a,start,len;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bit Field Extract.\n\nSyntax\n\nbfe.type  d, a, b, c;\n\n.type = { .u32, .u64,\n\n          .s32, .s64 };\n\nDescription\n\nExtract bit field from a and place the zero or sign-extended result in d. Source b gives\n\nthe bit field starting bit position, and source c gives the bit field length in bits.\n\nOperands a and d have the same type as the instruction type. Operands b and c are\n\ntype .u32, but are restricted to the 8-bit value range 0..255.\n\nThe sign bit of the extracted field is defined as:\n\n.u32, .u64:\n\nzero\n\n.s32, .s64:\n\nmsb of input a if the extracted field extends beyond the msb of a msb of extracted\n\nfield, otherwise\n\nIf the bit field length is zero, the result is zero.\n\nThe destination d is padded with the sign bit of the extracted field. If the start position is\n\nbeyond the msb of the input, the destination d is filled with the replicated sign bit of the\n\nextracted field.\n\nSemantics\n\nmsb = (.type==.u32 || .type==.s32) ? 31 : 63;\n\npos = b & 0xff;  // pos restricted to 0..255 range\n\nlen = c & 0xff;  // len restricted to 0..255 range\n\nif (.type==.u32 || .type==.u64 || len==0)\n\n    sbit = 0;\n\nelse\n\n    sbit = a[min(pos+len-1,msb)];\n\nd = 0;\n\nfor (i=0; i<=msb; i++) {\n\n    d[i] = (i<len && pos+i<=msb) ? a[pos+i] : sbit;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nbfe requires sm_20 or higher.\n\nExamples\n\nbfe.b32  d,a,start,len;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe"
            };

        case "bfi":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi\" target=\"_blank\" rel=\"noopener noreferrer\">bfi(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: bfi</h1><section id=\"integer-arithmetic-instructions-bfi\">\n\n\n<p>Bit Field Insert.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>bfi.type  f, a, b, c, d;\n\n.type = { .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Align and insert a bit field from <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> into <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and place the result in <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code>. Source <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\ngives the starting bit position for the insertion, and source <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> gives the bit field length in\nbits.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> have the same type as the instruction type. 
Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> are type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, but are restricted to the 8-bit value range <code class=\"docutils literal notranslate\"><span class=\"pre\">0..255</span></code>.</p>\n<p>If the bit field length is zero, the result is <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p>If the start position is beyond the msb of the input, the result is <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>msb = (.type==.b32) ? 31 : 63;\npos = c &amp; 0xff;  // pos restricted to 0..255 range\nlen = d &amp; 0xff;  // len restricted to 0..255 range\n\nf = b;\nfor (i=0; i&lt;len &amp;&amp; pos+i&lt;=msb; i++) {\n    f[pos+i] = a[i];\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bfi</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>bfi.b32  d,a,b,start,len;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bit Field Insert.\n\nSyntax\n\nbfi.type  f, a, b, c, d;\n\n.type = { .b32, .b64 };\n\nDescription\n\nAlign and insert a bit field from a into b, and place the result in f. Source c\n\ngives the starting bit position for the insertion, and source d gives the bit field length in\n\nbits.\n\nOperands a, b, and f have the same type as the instruction type. Operands c and\n\nd are type .u32, but are restricted to the 8-bit value range 0..255.\n\nIf the bit field length is zero, the result is b.\n\nIf the start position is beyond the msb of the input, the result is b.\n\nSemantics\n\nmsb = (.type==.b32) ? 31 : 63;\n\npos = c & 0xff;  // pos restricted to 0..255 range\n\nlen = d & 0xff;  // len restricted to 0..255 range\n\nf = b;\n\nfor (i=0; i<len && pos+i<=msb; i++) {\n\n    f[pos+i] = a[i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nbfi requires sm_20 or higher.\n\nExamples\n\nbfi.b32  d,a,b,start,len;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi"
            };

        case "bfind":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind\" target=\"_blank\" rel=\"noopener noreferrer\">bfind(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: bfind</h1><section id=\"integer-arithmetic-instructions-bfind\">\n\n\n<p>Find most significant non-sign bit.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>bfind.type           d, a;\nbfind.shiftamt.type  d, a;\n\n.type = { .u32, .u64,\n          .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Find the bit position of the most significant non-sign bit in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and place the result in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has the instruction type, and destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>. For unsigned\nintegers, <code class=\"docutils literal notranslate\"><span class=\"pre\">bfind</span></code> returns the bit position of the most significant <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code>. 
For signed integers,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">bfind</span></code> returns the bit position of the most significant <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> for negative inputs and the most\nsignificant <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> for non-negative inputs.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.shiftamt</span></code> is specified, <code class=\"docutils literal notranslate\"><span class=\"pre\">bfind</span></code> returns the shift amount needed to left-shift the found bit\ninto the most-significant bit position.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bfind</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">0xffffffff</span></code> if no non-sign bit is found.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>msb = (.type==.u32 || .type==.s32) ? 31 : 63;\n// negate negative signed inputs\nif ( (.type==.s32 || .type==.s64) &amp;&amp; (a &amp; (1&lt;&lt;msb)) ) {\n    a = ~a;\n}\n.u32  d = 0xffffffff;\nfor (.s32 i=msb; i&gt;=0; i--) {\n    if (a &amp; (1&lt;&lt;i))  { d = i; break; }\n}\nif (.shiftamt &amp;&amp; d != 0xffffffff)  { d = msb - d; }\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bfind</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>bfind.u32  d, a;\nbfind.shiftamt.s64  cnt, X;  // cnt is .u32\n</pre></div>\n</div>\n</section>",
                "tooltip": "Find most significant non-sign bit.\n\nSyntax\n\nbfind.type           d, a;\n\nbfind.shiftamt.type  d, a;\n\n.type = { .u32, .u64,\n\n          .s32, .s64 };\n\nDescription\n\nFind the bit position of the most significant non-sign bit in a and place the result in\n\nd. Operand a has the instruction type, and destination d has type .u32. For unsigned\n\nintegers, bfind returns the bit position of the most significant 1. For signed integers,\n\nbfind returns the bit position of the most significant 0 for negative inputs and the most\n\nsignificant 1 for non-negative inputs.\n\nIf .shiftamt is specified, bfind returns the shift amount needed to left-shift the found bit\n\ninto the most-significant bit position.\n\nbfind returns 0xffffffff if no non-sign bit is found.\n\nSemantics\n\nmsb = (.type==.u32 || .type==.s32) ? 31 : 63;\n\n// negate negative signed inputs\n\nif ( (.type==.s32 || .type==.s64) && (a & (1<<msb)) ) {\n\n    a = ~a;\n\n}\n\n.u32  d = 0xffffffff;\n\nfor (.s32 i=msb; i>=0; i--) {\n\n    if (a & (1<<i))  { d = i; break; }\n\n}\n\nif (.shiftamt && d != 0xffffffff)  { d = msb - d; }\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nbfind requires sm_20 or higher.\n\nExamples\n\nbfind.u32  d, a;\n\nbfind.shiftamt.s64  cnt, X;  // cnt is .u32\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind"
            };

        case "bmsk":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk\" target=\"_blank\" rel=\"noopener noreferrer\">bmsk(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: bmsk</h1><section id=\"integer-arithmetic-instructions-bmsk\">\n\n\n<p>Bit Field Mask.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>bmsk.mode.b32  d, a, b;\n\n.mode = { .clamp, .wrap };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Generates a 32-bit mask starting from the bit position specified in operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, and of the width\nspecified in operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>. The generated bitmask is stored in the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>The resulting bitmask is 0 in the following cases:</p>\n<ul class=\"simple\">\n<li><p>When the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is 32 or higher and <code class=\"docutils literal notranslate\"><span class=\"pre\">.mode</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code>.</p></li>\n<li><p>When either the specified value of <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> or the wrapped value of <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> (when <code class=\"docutils literal notranslate\"><span class=\"pre\">.mode</span></code> is\nspecified as <code class=\"docutils literal notranslate\"><span class=\"pre\">.wrap</span></code>) is 
0.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>a1    = a &amp; 0x1f;\nmask0 = (~0) &lt;&lt; a1;\nb1    = b &amp; 0x1f;\nsum   = a1 + b1;\nmask1 = (~0) &lt;&lt; sum;\n\nsum-overflow          = sum &gt;= 32 ? true : false;\nbit-position-overflow = false;\nbit-width-overflow    = false;\n\nif (.mode == .clamp) {\n    if (a &gt;= 32) {\n        bit-position-overflow = true;\n        mask0 = 0;\n    }\n    if (b &gt;= 32) {\n        bit-width-overflow = true;\n    }\n}\n\nif (sum-overflow || bit-position-overflow || bit-width-overflow) {\n    mask1 = 0;\n} else if (b1 == 0) {\n    mask1 = ~0;\n}\nd = mask0 &amp; ~mask1;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The bitmask width specified by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is limited to range <code class=\"docutils literal notranslate\"><span class=\"pre\">0..32</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code> mode and to\nrange <code class=\"docutils literal notranslate\"><span class=\"pre\">0..31</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">.wrap</span></code> mode.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.6.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bmsk</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>bmsk.clamp.b32  rd, ra, rb;\nbmsk.wrap.b32   rd, 1, 2; // Creates a bitmask of 0x00000006.\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bit Field Mask.\n\nSyntax\n\nbmsk.mode.b32  d, a, b;\n\n.mode = { .clamp, .wrap };\n\nDescription\n\nGenerates a 32-bit mask starting from the bit position specified in operand a, and of the width\n\nspecified in operand b. The generated bitmask is stored in the destination operand d.\n\nThe resulting bitmask is 0 in the following cases:\n\nWhen the value of a is 32 or higher and .mode is .clamp.\n\nWhen either the specified value of b or the wrapped value of b (when .mode is\n\nspecified as .wrap) is 0.\n\nSemantics\n\na1    = a & 0x1f;\n\nmask0 = (~0) << a1;\n\nb1    = b & 0x1f;\n\nsum   = a1 + b1;\n\nmask1 = (~0) << sum;\n\nsum-overflow          = sum >= 32 ? true : false;\n\nbit-position-overflow = false;\n\nbit-width-overflow    = false;\n\nif (.mode == .clamp) {\n\n    if (a >= 32) {\n\n        bit-position-overflow = true;\n\n        mask0 = 0;\n\n    }\n\n    if (b >= 32) {\n\n        bit-width-overflow = true;\n\n    }\n\n}\n\nif (sum-overflow || bit-position-overflow || bit-width-overflow) {\n\n    mask1 = 0;\n\n} else if (b1 == 0) {\n\n    mask1 = ~0;\n\n}\n\nd = mask0 & ~mask1;\n\nNotes\n\nThe bitmask width specified by operand b is limited to range 0..32 in .clamp mode and to\n\nrange 0..31 in .wrap mode.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nbmsk requires sm_70 or higher.\n\nExamples\n\nbmsk.clamp.b32  rd, ra, rb;\n\nbmsk.wrap.b32   rd, 1, 2; // Creates a bitmask of 0x00000006.\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk"
            };

        case "bra":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra\" target=\"_blank\" rel=\"noopener noreferrer\">bra <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Instructions: bra</h1><section id=\"control-flow-instructions-bra\">\n\n\n<p>Branch to a target and continue execution there.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>@p   bra{.uni}  tgt;           // tgt is a label\n     bra{.uni}  tgt;           // unconditional branch\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Continue execution at the target. Conditional branches are specified by using a guard predicate. The\nbranch target must be a label.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bra.uni</span></code> is guaranteed to be non-divergent, i.e. all active threads in a warp that are currently\nexecuting this instruction have identical values for the guard predicate and branch target.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (p) {\n    pc = tgt;\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>Unimplemented indirect branch introduced in PTX ISA version 2.1 has been removed from the spec.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>bra.uni  L_exit;    // uniform unconditional jump\n@q  bra      L23;   // conditional branch\n</pre></div>\n</div>\n</section>",
                "tooltip": "Branch to a target and continue execution there.\n\nSyntax\n\n@p   bra{.uni}  tgt;           // tgt is a label\n\n     bra{.uni}  tgt;           // unconditional branch\n\nDescription\n\nContinue execution at the target. Conditional branches are specified by using a guard predicate. The\n\nbranch target must be a label.\n\nbra.uni is guaranteed to be non-divergent, i.e. all active threads in a warp that are currently\n\nexecuting this instruction have identical values for the guard predicate and branch target.\n\nSemantics\n\nif (p) {\n\n    pc = tgt;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nUnimplemented indirect branch introduced in PTX ISA version 2.1 has been removed from the spec.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nbra.uni  L_exit;    // uniform unconditional jump\n\n@q  bra      L23;   // conditional branch\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra"
            };

        case "branchtargets":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-branchtargets\" target=\"_blank\" rel=\"noopener noreferrer\">branchtargets <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Directives: .branchtargets</h1><section id=\"control-flow-directives-branchtargets\">\n\n\n<p>Declare a list of potential branch targets.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>Label:   .branchtargets  list-of-labels ;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares a list of potential branch targets for a subsequent <code class=\"docutils literal notranslate\"><span class=\"pre\">brx.idx</span></code>, and associates the list\nwith the label at the start of the line.</p>\n<p>All control flow labels in the list must occur within the same function as the declaration.</p>\n<p>The list of labels may use the compact, shorthand syntax for enumerating a range of labels having a\ncommon prefix, similar to the syntax described in <a class=\"reference external\" href=\"#parameterized-variable-names\">Parameterized Variable Names</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>  .function foo () {\n      .reg .u32 %r0;\n      ...\n      L1:\n      ...\n      L2:\n      ...\n      L3:\n      ...\n      ts: .branchtargets L1, L2, L3;\n      @p brx.idx %r0, ts;\n      ...\n\n.function bar() {\n      .reg .u32 %r0;\n      ...\n      N0:\n      ...\n      N1:\n      ...\n      N2:\n      ...\n      N3:\n  
    ...\n      N4:\n      ...\n      ts: .branchtargets N&lt;5&gt;;\n      @p brx.idx %r0, ts;\n      ...\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare a list of potential branch targets.\n\nSyntax\n\nLabel:   .branchtargets  list-of-labels ;\n\nDescription\n\nDeclares a list of potential branch targets for a subsequent brx.idx, and associates the list\n\nwith the label at the start of the line.\n\nAll control flow labels in the list must occur within the same function as the declaration.\n\nThe list of labels may use the compact, shorthand syntax for enumerating a range of labels having a\n\ncommon prefix, similar to the syntax described in Parameterized Variable Names.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\n  .function foo () {\n\n      .reg .u32 %r0;\n\n      ...\n\n      L1:\n\n      ...\n\n      L2:\n\n      ...\n\n      L3:\n\n      ...\n\n      ts: .branchtargets L1, L2, L3;\n\n      @p brx.idx %r0, ts;\n\n      ...\n\n.function bar() {\n\n      .reg .u32 %r0;\n\n      ...\n\n      N0:\n\n      ...\n\n      N1:\n\n      ...\n\n      N2:\n\n      ...\n\n      N3:\n\n      ...\n\n      N4:\n\n      ...\n\n      ts: .branchtargets N<5>;\n\n      @p brx.idx %r0, ts;\n\n      ...\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-branchtargets"
            };

        case "brev":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev\" target=\"_blank\" rel=\"noopener noreferrer\">brev(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: brev</h1><section id=\"integer-arithmetic-instructions-brev\">\n\n\n<p>Bit reverse.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>brev.type  d, a;\n\n.type = { .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Perform bitwise reversal of input.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>msb = (.type==.b32) ? 31 : 63;\n\nfor (i=0; i&lt;=msb; i++) {\n    d[i] = a[msb-i];\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">brev</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>brev.b32  d, a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bit reverse.\n\nSyntax\n\nbrev.type  d, a;\n\n.type = { .b32, .b64 };\n\nDescription\n\nPerform bitwise reversal of input.\n\nSemantics\n\nmsb = (.type==.b32) ? 31 : 63;\n\nfor (i=0; i<=msb; i++) {\n\n    d[i] = a[msb-i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nbrev requires sm_20 or higher.\n\nExamples\n\nbrev.b32  d, a;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev"
            };

        case "brkpt":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt\" target=\"_blank\" rel=\"noopener noreferrer\">brkpt <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Miscellaneous Instructions: brkpt</h1><section id=\"miscellaneous-instructions-brkpt\">\n\n\n<p>Breakpoint.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>brkpt;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Suspends execution.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">brkpt</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_11</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>    brkpt;\n@p  brkpt;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Breakpoint.\n\nSyntax\n\nbrkpt;\n\nDescription\n\nSuspends execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nbrkpt requires sm_11 or higher.\n\nExamples\n\n    brkpt;\n\n@p  brkpt;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt"
            };

        case "brx":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx\" target=\"_blank\" rel=\"noopener noreferrer\">brx.idx <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Instructions: brx.idx</h1><section id=\"control-flow-instructions-brx-idx\">\n\n\n<p>Branch to a label indexed from a list of potential branch targets.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>@p    brx.idx{.uni} index, tlist;\n      brx.idx{.uni} index, tlist;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Index into a list of possible destination labels, and continue execution from the chosen\nlabel. Conditional branches are specified by using a guard predicate.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">brx.idx.uni</span></code> guarantees that the branch is non-divergent, i.e. all active threads in a warp that\nare currently executing this instruction have identical values for the guard predicate and the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">index</span></code> argument.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">index</span></code> operand is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> register. The <code class=\"docutils literal notranslate\"><span class=\"pre\">tlist</span></code> operand must be the label of a\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.branchtargets</span></code> directive. It is accessed as a zero-based sequence using <code class=\"docutils literal notranslate\"><span class=\"pre\">index</span></code>. 
Behaviour is\nundefined if the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">index</span></code> is greater than or equal to the length of <code class=\"docutils literal notranslate\"><span class=\"pre\">tlist</span></code>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.branchtargets</span></code> directive must be defined in the local function scope before it is used. It\nmust refer to labels within the current function.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (p) {\n    if (index &lt; length(tlist)) {\n      pc = tlist[index];\n    } else {\n      pc = undefined;\n    }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.function foo () {\n    .reg .u32 %r0;\n    ...\n    L1:\n    ...\n    L2:\n    ...\n    L3:\n    ...\n    ts: .branchtargets L1, L2, L3;\n    @p brx.idx %r0, ts;\n    ...\n}\n</pre></div>\n</div>\n</section>",
                "tooltip": "Branch to a label indexed from a list of potential branch targets.\n\nSyntax\n\n@p    brx.idx{.uni} index, tlist;\n\n      brx.idx{.uni} index, tlist;\n\nDescription\n\nIndex into a list of possible destination labels, and continue execution from the chosen\n\nlabel. Conditional branches are specified by using a guard predicate.\n\nbrx.idx.uni guarantees that the branch is non-divergent, i.e. all active threads in a warp that\n\nare currently executing this instruction have identical values for the guard predicate and the\n\nindex argument.\n\nThe index operand is a .u32 register. The tlist operand must be the label of a\n\n.branchtargets directive. It is accessed as a zero-based sequence using index. Behaviour is\n\nundefined if the value of index is greater than or equal to the length of tlist.\n\nThe .branchtargets directive must be defined in the local function scope before it is used. It\n\nmust refer to labels within the current function.\n\nSemantics\n\nif (p) {\n\n    if (index < length(tlist)) {\n\n      pc = tlist[index];\n\n    } else {\n\n      pc = undefined;\n\n    }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\n.function foo () {\n\n    .reg .u32 %r0;\n\n    ...\n\n    L1:\n\n    ...\n\n    L2:\n\n    ...\n\n    L3:\n\n    ...\n\n    ts: .branchtargets L1, L2, L3;\n\n    @p brx.idx %r0, ts;\n\n    ...\n\n}\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx"
            };

        case "call":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call\" target=\"_blank\" rel=\"noopener noreferrer\">call <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Instructions: call</h1><section id=\"control-flow-instructions-call\">\n\n\n<p>Call a function, recording the return location.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// direct call to named function, func is a symbol\ncall{.uni} (ret-param), func, (param-list);\ncall{.uni} func, (param-list);\ncall{.uni} func;\n\n// indirect call via pointer, with full list of call targets\ncall{.uni} (ret-param), fptr, (param-list), flist;\ncall{.uni} fptr, (param-list), flist;\ncall{.uni} fptr, flist;\n\n// indirect call via pointer, with no knowledge of call targets\ncall{.uni} (ret-param), fptr, (param-list), fproto;\ncall{.uni} fptr, (param-list), fproto;\ncall{.uni} fptr, fproto;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> instruction stores the address of the next instruction, so execution can resume at that\npoint after executing a <code class=\"docutils literal notranslate\"><span class=\"pre\">ret</span></code> instruction. A <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> is assumed to be divergent unless the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.uni</span></code> suffix is present. The <code class=\"docutils literal notranslate\"><span class=\"pre\">.uni</span></code> suffix indicates that the <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> is guaranteed to be\nnon-divergent, i.e. 
all active threads in a warp that are currently executing this instruction have\nidentical values for the guard predicate and <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> target.</p>\n<p>For direct calls, the called location <code class=\"docutils literal notranslate\"><span class=\"pre\">func</span></code> must be a symbolic function name; for indirect calls,\nthe called location <code class=\"docutils literal notranslate\"><span class=\"pre\">fptr</span></code> must be an address of a function held in a register. Input arguments\nand return values are optional.\u00a0Arguments may be registers, immediate constants, or variables in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space. Arguments are pass-by-value.</p>\n<p>Indirect calls require an additional operand, <code class=\"docutils literal notranslate\"><span class=\"pre\">flist</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">fproto</span></code>, to communicate the list of\npotential <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> targets or the common function prototype of all <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> targets,\nrespectively. In the first case, <code class=\"docutils literal notranslate\"><span class=\"pre\">flist</span></code> gives a complete list of potential <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> targets and\nthe optimizing backend is free to optimize the calling convention. 
In the second case, where the\ncomplete list of potential <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> targets may not be known, the common function prototype is given\nand the <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> must obey the ABI\u2019s calling convention.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">flist</span></code> operand is either the name of an array (call table) initialized to a list of function\nnames; or a label associated with a <code class=\"docutils literal notranslate\"><span class=\"pre\">.calltargets</span></code> directive, which declares a list of potential\n<code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> targets. In both cases the fptr register holds the address of a function listed in the call\ntable or <code class=\"docutils literal notranslate\"><span class=\"pre\">.calltargets</span></code> list, and the <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> operands are type-checked against the type\nsignature of the functions indicated by <code class=\"docutils literal notranslate\"><span class=\"pre\">flist</span></code>.</p>\n<p>The fproto operand is the name of a label associated with a <code class=\"docutils literal notranslate\"><span class=\"pre\">.callprototype</span></code> directive. This\noperand is used when a complete list of potential targets is not known. The <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> operands are\ntype-checked against the prototype, and code generation will follow the ABI calling convention. If a\nfunction that doesn\u2019t match the prototype is called, the behavior is undefined.</p>\n<p>Call tables may be declared at module scope or local scope, in either the constant or global state\nspace. 
The <code class=\"docutils literal notranslate\"><span class=\"pre\">.calltargets</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.callprototype</span></code> directives must be declared within a function\nbody. All functions must be declared prior to being referenced in a <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> table initializer or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.calltargets</span></code> directive.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Direct <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> introduced in PTX ISA version 1.0. Indirect <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> introduced in PTX ISA version 2.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Direct <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> supported on all target architectures. Indirect <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// examples of direct call\n    call     init;    // call function 'init'\n    call.uni g, (a);  // call function 'g' with parameter 'a'\n@p  call     (d), h, (a, b);  // return value into register d\n\n// call-via-pointer using jump table\n.func (.reg .u32 rv) foo (.reg .u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n\n.global .u32 jmptbl[5] = { foo, bar, baz };\n      ...\n@p    ld.global.u32  %r0, [jmptbl+4];\n@p    ld.global.u32  %r0, [jmptbl+8];\n      call  (retval), %r0, (x, y), jmptbl;\n\n// call-via-pointer using .calltargets directive\n.func (.reg .u32 rv) foo (.reg 
.u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n      ...\n@p    mov.u32  %r0, foo;\n@q    mov.u32  %r0, baz;\nFtgt: .calltargets foo, bar, baz;\n      call  (retval), %r0, (x, y), Ftgt;\n\n// call-via-pointer using .callprototype directive\n.func dispatch (.reg .u32 fptr, .reg .u32 idx)\n{\n...\nFproto: .callprototype _ (.param .u32 _, .param .u32 _);\n      call  %fptr, (x, y), Fproto;\n...\n</pre></div>\n</div>\n</section>",
                "tooltip": "Call a function, recording the return location.\n\nSyntax\n\n// direct call to named function, func is a symbol\n\ncall{.uni} (ret-param), func, (param-list);\n\ncall{.uni} func, (param-list);\n\ncall{.uni} func;\n\n// indirect call via pointer, with full list of call targets\n\ncall{.uni} (ret-param), fptr, (param-list), flist;\n\ncall{.uni} fptr, (param-list), flist;\n\ncall{.uni} fptr, flist;\n\n// indirect call via pointer, with no knowledge of call targets\n\ncall{.uni} (ret-param), fptr, (param-list), fproto;\n\ncall{.uni} fptr, (param-list), fproto;\n\ncall{.uni} fptr, fproto;\n\nDescription\n\nThe call instruction stores the address of the next instruction, so execution can resume at that\n\npoint after executing a ret instruction. A call is assumed to be divergent unless the\n\n.uni suffix is present. The .uni suffix indicates that the call is guaranteed to be\n\nnon-divergent, i.e. all active threads in a warp that are currently executing this instruction have\n\nidentical values for the guard predicate and call target.\n\nFor direct calls, the called location func must be a symbolic function name; for indirect calls,\n\nthe called location fptr must be an address of a function held in a register. Input arguments\n\nand return values are optional.\u00a0Arguments may be registers, immediate constants, or variables in\n\n.param space. Arguments are pass-by-value.\n\nIndirect calls require an additional operand, flist or fproto, to communicate the list of\n\npotential call targets or the common function prototype of all call targets,\n\nrespectively. In the first case, flist gives a complete list of potential call targets and\n\nthe optimizing backend is free to optimize the calling convention. 
In the second case, where the\n\ncomplete list of potential call targets may not be known, the common function prototype is given\n\nand the call must obey the ABI\u2019s calling convention.\n\nThe flist operand is either the name of an array (call table) initialized to a list of function\n\nnames; or a label associated with a .calltargets directive, which declares a list of potential\n\ncall targets. In both cases the fptr register holds the address of a function listed in the call\n\ntable or .calltargets list, and the call operands are type-checked against the type\n\nsignature of the functions indicated by flist.\n\nThe fproto operand is the name of a label associated with a .callprototype directive. This\n\noperand is used when a complete list of potential targets is not known. The call operands are\n\ntype-checked against the prototype, and code generation will follow the ABI calling convention. If a\n\nfunction that doesn\u2019t match the prototype is called, the behavior is undefined.\n\nCall tables may be declared at module scope or local scope, in either the constant or global state\n\nspace. The .calltargets and .callprototype directives must be declared within a function\n\nbody. All functions must be declared prior to being referenced in a call table initializer or\n\n.calltargets directive.\n\nPTX ISA Notes\n\nDirect call introduced in PTX ISA version 1.0. Indirect call introduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nDirect call supported on all target architectures. 
Indirect call requires sm_20 or higher.\n\nExamples\n\n// examples of direct call\n\n    call     init;    // call function 'init'\n\n    call.uni g, (a);  // call function 'g' with parameter 'a'\n\n@p  call     (d), h, (a, b);  // return value into register d\n\n// call-via-pointer using jump table\n\n.func (.reg .u32 rv) foo (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n\n.global .u32 jmptbl[5] = { foo, bar, baz };\n\n      ...\n\n@p    ld.global.u32  %r0, [jmptbl+4];\n\n@p    ld.global.u32  %r0, [jmptbl+8];\n\n      call  (retval), %r0, (x, y), jmptbl;\n\n// call-via-pointer using .calltargets directive\n\n.func (.reg .u32 rv) foo (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n\n      ...\n\n@p    mov.u32  %r0, foo;\n\n@q    mov.u32  %r0, baz;\n\nFtgt: ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call"
            };

        case "callprototype":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-callprototype\" target=\"_blank\" rel=\"noopener noreferrer\">callprototype <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Directives: .callprototype</h1><section id=\"control-flow-directives-callprototype\">\n\n\n<p>Declare a prototype for use in an indirect call.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span> // no input or return parameters\nlabel: .callprototype _ .noreturn;\n// input params, no return params\nlabel: .callprototype _ (param-list) .noreturn;\n// no input params, // return params\nlabel: .callprototype (ret-param) _ ;\n// input, return parameters\nlabel: .callprototype (ret-param) _ (param-list);\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Defines a prototype with no specific function name, and associates the prototype with a label. The\nprototype may then be used in indirect call instructions where there is incomplete knowledge of the\npossible call targets.</p>\n<p>Parameters may have either base types in the register or parameter state spaces, or array types in\nparameter state space. The sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> may be used to avoid dummy parameter names.</p>\n<p>An optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive indicates that the function does not return to the caller\nfunction. <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive cannot be specified on functions which have return parameters. 
See\nthe description of .noreturn directive in <a class=\"reference external\" href=\"#performance-tuning-directives-noreturn\">Performance-Tuning Directives: .noreturn</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.1.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive introduced in PTX ISA version 6.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>Fproto1: .callprototype  _ ;\nFproto2: .callprototype  _ (.param .f32 _);\nFproto3: .callprototype  (.param .u32 _) _ ;\nFproto4: .callprototype  (.param .u32 _) _ (.param .f32 _);\n...\n@p   call  (%val), %r0, (%f1), Fproto4;\n...\n\n// example of array parameter\nFproto5: .callprototype _ (.param .b8 _[12]);\n\nFproto6: .callprototype  _ (.param .f32 _) .noreturn;\n...\n@p   call  %r0, (%f1), Fproto6;\n...\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare a prototype for use in an indirect call.\n\nSyntax\n\n // no input or return parameters\n\nlabel: .callprototype _ .noreturn;\n\n// input params, no return params\n\nlabel: .callprototype _ (param-list) .noreturn;\n\n// no input params, // return params\n\nlabel: .callprototype (ret-param) _ ;\n\n// input, return parameters\n\nlabel: .callprototype (ret-param) _ (param-list);\n\nDescription\n\nDefines a prototype with no specific function name, and associates the prototype with a label. The\n\nprototype may then be used in indirect call instructions where there is incomplete knowledge of the\n\npossible call targets.\n\nParameters may have either base types in the register or parameter state spaces, or array types in\n\nparameter state space. The sink symbol '_' may be used to avoid dummy parameter names.\n\nAn optional .noreturn directive indicates that the function does not return to the caller\n\nfunction. .noreturn directive cannot be specified on functions which have return parameters. See\n\nthe description of .noreturn directive in Performance-Tuning Directives: .noreturn.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nSupport for .noreturn directive introduced in PTX ISA version 6.4.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\n.noreturn directive requires sm_30 or higher.\n\nExamples\n\nFproto1: .callprototype  _ ;\n\nFproto2: .callprototype  _ (.param .f32 _);\n\nFproto3: .callprototype  (.param .u32 _) _ ;\n\nFproto4: .callprototype  (.param .u32 _) _ (.param .f32 _);\n\n...\n\n@p   call  (%val), %r0, (%f1), Fproto4;\n\n...\n\n// example of array parameter\n\nFproto5: .callprototype _ (.param .b8 _[12]);\n\nFproto6: .callprototype  _ (.param .f32 _) .noreturn;\n\n...\n\n@p   call  %r0, (%f1), Fproto6;\n\n...\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-callprototype"
            };

        case "calltargets":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-calltargets\" target=\"_blank\" rel=\"noopener noreferrer\">calltargets <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Directives: .calltargets</h1><section id=\"control-flow-directives-calltargets\">\n\n\n<p>Declare a list of potential call targets.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>Label:   .calltargets  list-of-functions ;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares a list of potential call targets for a subsequent indirect call, and associates the list\nwith the label at the start of the line.</p>\n<p>All functions named in the list must be declared prior to the <code class=\"docutils literal notranslate\"><span class=\"pre\">.calltargets</span></code> directive, and all\nfunctions must have the same type signature.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>calltgt:  .calltargets  fastsin, fastcos;\n...\n@p   call  (%f1), %r0, (%x), calltgt;\n...\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare a list of potential call targets.\n\nSyntax\n\nLabel:   .calltargets  list-of-functions ;\n\nDescription\n\nDeclares a list of potential call targets for a subsequent indirect call, and associates the list\n\nwith the label at the start of the line.\n\nAll functions named in the list must be declared prior to the .calltargets directive, and all\n\nfunctions must have the same type signature.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\ncalltgt:  .calltargets  fastsin, fastcos;\n\n...\n\n@p   call  (%f1), %r0, (%x), calltgt;\n\n...\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-calltargets"
            };

        case "clock":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock-clock-hi\" target=\"_blank\" rel=\"noopener noreferrer\">clock <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %clock, %clock_hi</h1><section id=\"special-registers-clock-clock-hi\">\n<span id=\"special-registers-clock\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code></dt>\n<dd>\n<p>A predefined, read-only 32-bit unsigned cycle counter.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code></dt>\n<dd>\n<p>The upper 32-bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> special register.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %clock;\n.sreg .u32 %clock_hi;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special register <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> are unsigned 32-bit read-only cycle counters that wrap\nsilently.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> introduced in PTX ISA version 5.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> 
requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32 r1,%clock;\nmov.u32 r2, %clock_hi;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%clock\n\nA predefined, read-only 32-bit unsigned cycle counter.\n\n%clock_hi\n\nThe upper 32-bits of %clock64 special register.\n\nSyntax (predefined)\n\n.sreg .u32 %clock;\n\n.sreg .u32 %clock_hi;\n\nDescription\n\nSpecial register %clock and %clock_hi are unsigned 32-bit read-only cycle counters that wrap\n\nsilently.\n\nPTX ISA Notes\n\n%clock introduced in PTX ISA version 1.0.\n\n%clock_hi introduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\n%clock supported on all target architectures.\n\n%clock_hi requires sm_20 or higher.\n\nExamples\n\nmov.u32 r1,%clock;\n\nmov.u32 r2, %clock_hi;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock-clock-hi"
            };

        case "clock64":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock64\" target=\"_blank\" rel=\"noopener noreferrer\">clock64 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %clock64</h1><section id=\"special-registers-clock64\">\n\n\n<p>A predefined, read-only 64-bit unsigned cycle counter.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u64 %clock64;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special register <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> is an unsigned 64-bit read-only cycle counter that wraps silently.</p>\n<p><strong>Notes</strong></p>\n<p>The lower 32-bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> are identical to <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code>.</p>\n<p>The upper 32-bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> are identical to <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u64  r1,%clock64;\n</pre></div>\n</div>\n</section>",
                "tooltip": "A predefined, read-only 64-bit unsigned cycle counter.\n\nSyntax (predefined)\n\n.sreg .u64 %clock64;\n\nDescription\n\nSpecial register %clock64 is an unsigned 64-bit read-only cycle counter that wraps silently.\n\nNotes\n\nThe lower 32-bits of %clock64 are identical to %clock.\n\nThe upper 32-bits of %clock64 are identical to %clock_hi.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%clock64 requires sm_20 or higher.\n\nExamples\n\nmov.u64  r1,%clock64;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock64"
            };

        case "clock_hi":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock-clock-hi\" target=\"_blank\" rel=\"noopener noreferrer\">clock_hi <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %clock, %clock_hi</h1><section id=\"special-registers-clock-clock-hi\">\n<span id=\"special-registers-clock\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code></dt>\n<dd>\n<p>A predefined, read-only 32-bit unsigned cycle counter.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code></dt>\n<dd>\n<p>The upper 32-bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> special register.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %clock;\n.sreg .u32 %clock_hi;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special register <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> are unsigned 32-bit read-only cycle counters that wrap\nsilently.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> introduced in PTX ISA version 5.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> 
requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32 r1,%clock;\nmov.u32 r2, %clock_hi;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%clock\n\nA predefined, read-only 32-bit unsigned cycle counter.\n\n%clock_hi\n\nThe upper 32-bits of %clock64 special register.\n\nSyntax (predefined)\n\n.sreg .u32 %clock;\n\n.sreg .u32 %clock_hi;\n\nDescription\n\nSpecial register %clock and %clock_hi are unsigned 32-bit read-only cycle counters that wrap\n\nsilently.\n\nPTX ISA Notes\n\n%clock introduced in PTX ISA version 1.0.\n\n%clock_hi introduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\n%clock supported on all target architectures.\n\n%clock_hi requires sm_20 or higher.\n\nExamples\n\nmov.u32 r1,%clock;\n\nmov.u32 r2, %clock_hi;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock-clock-hi"
            };

        case "cluster_ctaid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctaid\" target=\"_blank\" rel=\"noopener noreferrer\">cluster_ctaid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %cluster_ctaid</h1><section id=\"special-registers-cluster-ctaid\">\n\n\n<p>CTA identifier within a cluster.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %cluster_ctaid;\n.sreg .u32 %cluster_ctaid.x, %cluster_ctaid.y, %cluster_ctaid.z;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the CTA identifier in a cluster in each\ndimension. Each CTA in a cluster has a unique CTA identifier.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">%cluster_ctaid</span></code> special register contains a 1D, 2D, or 3D vector, depending upon the shape of\nthe cluster. 
The fourth element is unused and always returns zero.</p>\n<p>It is guaranteed that:</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>0  &lt;=  %cluster_ctaid.x &lt;  %cluster_nctaid.x\n0  &lt;=  %cluster_ctaid.y &lt;  %cluster_nctaid.y\n0  &lt;=  %cluster_ctaid.z &lt;  %cluster_nctaid.z\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 %r&lt;2&gt;;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %cluster_ctaid.x;\nmov.u32     %r1, %cluster_ctaid.z;\nmov.v4.u32  %rx, %cluster_ctaid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "CTA identifier within a cluster.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %cluster_ctaid;\n\n.sreg .u32 %cluster_ctaid.x, %cluster_ctaid.y, %cluster_ctaid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the CTA identifier in a cluster in each\n\ndimension. Each CTA in a cluster has a unique CTA identifier.\n\nThe %cluster_ctaid special register contains a 1D, 2D, or 3D vector, depending upon the shape of\n\nthe cluster. The fourth element is unused and always returns zero.\n\nIt is guaranteed that:\n\n0  <=  %cluster_ctaid.x <  %cluster_nctaid.x\n\n0  <=  %cluster_ctaid.y <  %cluster_nctaid.y\n\n0  <=  %cluster_ctaid.z <  %cluster_nctaid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %cluster_ctaid.x;\n\nmov.u32     %r1, %cluster_ctaid.z;\n\nmov.v4.u32  %rx, %cluster_ctaid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctaid"
            };

        case "cluster_ctarank":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctarank\" target=\"_blank\" rel=\"noopener noreferrer\">cluster_ctarank <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %cluster_ctarank</h1><section id=\"special-registers-cluster-ctarank\">\n\n\n<p>CTA identifier in a cluster across all dimensions.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %cluster_ctarank;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the CTA rank within a cluster across all\ndimensions.</p>\n<p>It is guaranteed that:</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>0  &lt;=  %cluster_ctarank &lt;  %cluster_nctarank\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 %r;\n\nmov.u32  %r, %cluster_ctarank;\n</pre></div>\n</div>\n</section>",
                "tooltip": "CTA identifier in a cluster across all dimensions.\n\nSyntax (predefined)\n\n.sreg .u32 %cluster_ctarank;\n\nDescription\n\nA predefined, read-only special register initialized with the CTA rank within a cluster across all\n\ndimensions.\n\nIt is guaranteed that:\n\n0  <=  %cluster_ctarank <  %cluster_nctarank\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r;\n\nmov.u32  %r, %cluster_ctarank;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctarank"
            };

        case "cluster_nctaid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctaid\" target=\"_blank\" rel=\"noopener noreferrer\">cluster_nctaid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %cluster_nctaid</h1><section id=\"special-registers-cluster-nctaid\">\n\n\n<p>Number of CTA identifiers per cluster.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %cluster_nctaid;\n.sreg .u32 %cluster_nctaid.x, %cluster_nctaid.y, %cluster_nctaid.z;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the number of CTAs in a cluster in each\ndimension.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">%cluster_nctaid</span></code> special register contains a 3D grid shape vector that holds the cluster\ndimensions in terms of CTAs. The fourth element is unused and always returns zero.</p>\n<p>Refer to the <em>Cuda Programming Guide</em> for details on the maximum values of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%cluster_nctaid.{x,y,z}</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 %r&lt;2&gt;;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %cluster_nctaid.x;\nmov.u32     %r1, %cluster_nctaid.z;\nmov.v4.u32  %rx, %cluster_nctaid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of CTA identifiers per cluster.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %cluster_nctaid;\n\n.sreg .u32 %cluster_nctaid.x, %cluster_nctaid.y, %cluster_nctaid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the number of CTAs in a cluster in each\n\ndimension.\n\nThe %cluster_nctaid special register contains a 3D grid shape vector that holds the cluster\n\ndimensions in terms of CTAs. The fourth element is unused and always returns zero.\n\nRefer to the Cuda Programming Guide for details on the maximum values of\n\n%cluster_nctaid.{x,y,z}.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %cluster_nctaid.x;\n\nmov.u32     %r1, %cluster_nctaid.z;\n\nmov.v4.u32  %rx, %cluster_nctaid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctaid"
            };

        case "cluster_nctarank":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctarank\" target=\"_blank\" rel=\"noopener noreferrer\">cluster_nctarank <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %cluster_nctarank</h1><section id=\"special-registers-cluster-nctarank\">\n\n\n<p>Number of CTA identifiers in a cluster across all dimensions.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %cluster_nctarank;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the nunber of CTAs within a cluster across\nall dimensions.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 %r;\n\nmov.u32  %r, %cluster_nctarank;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of CTA identifiers in a cluster across all dimensions.\n\nSyntax (predefined)\n\n.sreg .u32 %cluster_nctarank;\n\nDescription\n\nA predefined, read-only special register initialized with the nunber of CTAs within a cluster across\n\nall dimensions.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r;\n\nmov.u32  %r, %cluster_nctarank;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctarank"
            };

        case "clusterid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clusterid\" target=\"_blank\" rel=\"noopener noreferrer\">clusterid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %clusterid</h1><section id=\"special-registers-clusterid\">\n\n\n<p>Cluster identifier within a grid.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %clusterid;\n.sreg .u32 %clusterid.x, %clusterid.y, %clusterid.z;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the cluster identifier in a grid in each\ndimension. Each cluster in a grid has a unique identifier.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">%clusterid</span></code> special register contains a 1D, 2D, or 3D vector, depending upon the shape and\nrank of the cluster. The fourth element is unused and always returns zero.</p>\n<p>It is guaranteed that:</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>0  &lt;=  %clusterid.x &lt;  %nclusterid.x\n0  &lt;=  %clusterid.y &lt;  %nclusterid.y\n0  &lt;=  %clusterid.z &lt;  %nclusterid.z\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 %r&lt;2&gt;;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %clusterid.x;\nmov.u32     %r1, %clusterid.z;\nmov.v4.u32  %rx, %clusterid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Cluster identifier within a grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %clusterid;\n\n.sreg .u32 %clusterid.x, %clusterid.y, %clusterid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the cluster identifier in a grid in each\n\ndimension. Each cluster in a grid has a unique identifier.\n\nThe %clusterid special register contains a 1D, 2D, or 3D vector, depending upon the shape and\n\nrank of the cluster. The fourth element is unused and always returns zero.\n\nIt is guaranteed that:\n\n0  <=  %clusterid.x <  %nclusterid.x\n\n0  <=  %clusterid.y <  %nclusterid.y\n\n0  <=  %clusterid.z <  %nclusterid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %clusterid.x;\n\nmov.u32     %r1, %clusterid.z;\n\nmov.v4.u32  %rx, %clusterid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clusterid"
            };

        case "clz":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz\" target=\"_blank\" rel=\"noopener noreferrer\">clz(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: clz</h1><section id=\"integer-arithmetic-instructions-clz\">\n\n\n<p>Count leading zeros.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>clz.type  d, a;\n\n.type = { .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Count the number of leading zeros in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> starting with the most-significant bit and place the\nresult in 32-bit destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.\u00a0Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has the instruction type, and destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type, the number of leading zeros is between 0 and 32,\ninclusively. 
For<code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code> type, the number of leading zeros is between 0 and 64, inclusively.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.u32  d = 0;\nif (.type == .b32)   { max = 32; mask = 0x80000000; }\nelse                 { max = 64; mask = 0x8000000000000000; }\n\nwhile (d &lt; max &amp;&amp; (a&amp;mask == 0) ) {\n    d++;\n    a = a &lt;&lt; 1;\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">clz</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>clz.b32  d, a;\nclz.b64  cnt, X;  // cnt is .u32\n</pre></div>\n</div>\n</section>",
                "tooltip": "Count leading zeros.\n\nSyntax\n\nclz.type  d, a;\n\n.type = { .b32, .b64 };\n\nDescription\n\nCount the number of leading zeros in a starting with the most-significant bit and place the\n\nresult in 32-bit destination register d.\u00a0Operand a has the instruction type, and destination\n\nd has type .u32. For .b32 type, the number of leading zeros is between 0 and 32,\n\ninclusively. For.b64 type, the number of leading zeros is between 0 and 64, inclusively.\n\nSemantics\n\n.u32  d = 0;\n\nif (.type == .b32)   { max = 32; mask = 0x80000000; }\n\nelse                 { max = 64; mask = 0x8000000000000000; }\n\nwhile (d < max && (a&mask == 0) ) {\n\n    d++;\n\n    a = a << 1;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nclz requires sm_20 or higher.\n\nExamples\n\nclz.b32  d, a;\n\nclz.b64  cnt, X;  // cnt is .u32\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz"
            };

        case "cnot":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot\" target=\"_blank\" rel=\"noopener noreferrer\">cnot <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: cnot</h1><section id=\"logic-and-shift-instructions-cnot\">\n\n\n<p>C/C++ style logical negation.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cnot.type d, a;\n\n.type = { .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the logical negation using C/C++ semantics.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = (a==0) ? 1 : 0;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The size of the operands must match, but not necessarily the type.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cnot.b32 d,a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "C/C++ style logical negation.\n\nSyntax\n\ncnot.type d, a;\n\n.type = { .b16, .b32, .b64 };\n\nDescription\n\nCompute the logical negation using C/C++ semantics.\n\nSemantics\n\nd = (a==0) ? 1 : 0;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\ncnot.b32 d,a;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot"
            };

        case "common":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-common\" target=\"_blank\" rel=\"noopener noreferrer\">common <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Linking Directives: .common</h1><section id=\"linking-directives-common\">\n\n\n<p>Visible (externally) symbol declaration.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.common identifier\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares identifier to be globally visible but \u201ccommon\u201d.</p>\n<p>Common symbols are similar to globally visible symbols. However multiple object files may declare\nthe same common symbol and they may have different types and sizes and references to a symbol get\nresolved against a common symbol with the largest size.</p>\n<p>Only one object file can initialize a common symbol and that must have the largest size among all\nother definitions of that common symbol from different object files.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.common</span></code> linking directive can be used only on variables with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> storage. It cannot be\nused on function symbols or on symbols with opaque type.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 5.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.common</span></code> directive requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.common .global .u32 gbl;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Visible (externally) symbol declaration.\n\nSyntax\n\n.common identifier\n\nDescription\n\nDeclares identifier to be globally visible but \u201ccommon\u201d.\n\nCommon symbols are similar to globally visible symbols. However multiple object files may declare\n\nthe same common symbol and they may have different types and sizes and references to a symbol get\n\nresolved against a common symbol with the largest size.\n\nOnly one object file can initialize a common symbol and that must have the largest size among all\n\nother definitions of that common symbol from different object files.\n\n.common linking directive can be used only on variables with .global storage. It cannot be\n\nused on function symbols or on symbols with opaque type.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\n.common directive requires sm_20 or higher.\n\nExamples\n\n.common .global .u32 gbl;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-common"
            };

        case "copysign":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign\" target=\"_blank\" rel=\"noopener noreferrer\">copysign(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: copysign</h1><section id=\"floating-point-instructions-copysign\">\n\n\n<p>Copy sign of one input to another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>copysign.type  d, a, b;\n\n.type = { .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Copy sign bit of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> into value of <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and return the result as <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>copysign.f32  x, y, z;\ncopysign.f64  A, B, C;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Copy sign of one input to another.\n\nSyntax\n\ncopysign.type  d, a, b;\n\n.type = { .f32, .f64 };\n\nDescription\n\nCopy sign bit of a into value of b, and return the result as d.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\ncopysign.f32  x, y, z;\n\ncopysign.f64  A, B, C;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign"
            };

        case "cos":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos\" target=\"_blank\" rel=\"noopener noreferrer\">cos(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: cos</h1><section id=\"floating-point-instructions-cos\">\n\n\n<p>Find the cosine of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cos.approx{.ftz}.f32  d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Find the cosine of the angle <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> (in radians).</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = cos(a);\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cos.approx.f32</span></code> implements a fast approximation to cosine.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 57%\"/>\n<col style=\"width: 43%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p>-Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>-subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>-0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>+0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>+subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>+Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The maximum absolute error 
is 2<sup>-20.9</sup> in quadrant 00.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cos.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p>Subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cos.f32</span></code> introduced in PTX ISA version 1.0. Explicit modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code>\nintroduced in PTX ISA version 1.4.</p>\n<p>For PTX ISA version 1.4 and later, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> modifier is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">cos.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">cos.approx.ftz.f32</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cos.approx.ftz.f32  ca, a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Find the cosine of a value.\n\nSyntax\n\ncos.approx{.ftz}.f32  d, a;\n\nDescription\n\nFind the cosine of the angle a (in radians).\n\nSemantics\n\nd = cos(a);\n\nNotes\n\ncos.approx.f32 implements a fast approximation to cosine.\n\n\n\nInput\n\nResult\n\n\n\n-Inf\n\nNaN\n\n-subnormal\n\n+1.0\n\n-0.0\n\n+1.0\n\n+0.0\n\n+1.0\n\n+subnormal\n\n+1.0\n\n+Inf\n\nNaN\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2-20.9 in quadrant 00.\n\nSubnormal numbers:\n\nsm_20+\n\nBy default, subnormal numbers are supported.\n\ncos.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1x\n\nSubnormal inputs and results to sign-preserving zero.\n\nPTX ISA Notes\n\ncos.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\n\nintroduced in PTX ISA version 1.4.\n\nFor PTX ISA version 1.4 and later, the .approx modifier is required.\n\nFor PTX ISA versions 1.0 through 1.3, cos.f32 defaults to cos.approx.ftz.f32.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\ncos.approx.ftz.f32  ca, a;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos"
            };

        case "cp":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk.commit_group <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk.prefetch <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk.prefetch.tensor <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk.tensor <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a 
href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk.wait_group <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.commit_group <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.mbarrier.arrive <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.wait_all <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.wait_group <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk\" target=\"_blank\" rel=\"noopener noreferrer\">cp.reduce.async.bulk <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new 
window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor\" target=\"_blank\" rel=\"noopener noreferrer\">cp.reduce.async.bulk.tensor <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: cp.async</h1><section id=\"data-movement-and-conversion-instructions-cp-async\">\n\n\n<p>Initiates an asynchronous copy operation from one state space to another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], cp-size{, src-size}{, cache-policy} ;\ncp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], 16{, src-size}{, cache-policy} ;\ncp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], cp-size{, ignore-src}{, cache-policy} ;\ncp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], 16{, ignore-src}{, cache-policy} ;\n\n.level::cache_hint =     { .L2::cache_hint }\n.level::prefetch_size =  { .L2::64B, .L2::128B, .L2::256B }\ncp-size =                { 4, 8, 16 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> is a non-blocking instruction which initiates an asynchronous copy operation of data\nfrom the location specified by source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> to the location specified by\ndestination address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code>. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> specifies a location in the global state space\nand <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code> specifies a location in the shared state space.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cp-size</span></code> is an integer constant which specifies the size of data in bytes to be copied to\nthe destination <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">cp-size</span></code> can only be 4, 8 and 16.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> allows optionally specifying a 32-bit integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">src-size</span></code>. Operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">src-size</span></code> represents the size of the data in bytes to be copied from <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> to <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code> and must\nbe less than <code class=\"docutils literal notranslate\"><span class=\"pre\">cp-size</span></code>. In such case, remaining bytes in destination <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code> are filled with\nzeros. 
Specifying <code class=\"docutils literal notranslate\"><span class=\"pre\">src-size</span></code> larger than <code class=\"docutils literal notranslate\"><span class=\"pre\">cp-size</span></code> results in undefined behavior.</p>\n<p>The optional and non-immediate predicate argument <code class=\"docutils literal notranslate\"><span class=\"pre\">ignore-src</span></code> specifies whether the data from the\nsource location <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> should be ignored completely. If the source data is ignored then zeros will\nbe copied to destination <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code>. If the argument <code class=\"docutils literal notranslate\"><span class=\"pre\">ignore-src</span></code> is not specified then it defaults\nto <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p>\n<p>Supported alignment requirements and addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code> are described\nin <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>.</p>\n<p>The mandatory <code class=\"docutils literal notranslate\"><span class=\"pre\">.async</span></code> qualifier indicates that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp</span></code> instruction will initiate the memory\ncopy operation asynchronously and control will return to the executing thread before the copy\noperation is complete. 
The executing thread can then use <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> or <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier\">mbarrier instructions</a> to wait for\ncompletion of the asynchronous copy operation. No other synchronization mechanisms described in\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a> can be used to guarantee the\ncompletion of the asynchronous copy operations.</p>\n<p>There is no ordering guarantee between two <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations if they are not explicitly\nsynchronized using <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> or <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier\">mbarrier instructions</a>.</p>\n<p>As described in <a class=\"reference external\" href=\"#cache-operators\">Cache Operators</a>, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cg</span></code> qualifier indicates\ncaching of data only at global level cache L2 and not at L1 whereas <code class=\"docutils literal notranslate\"><span class=\"pre\">.ca</span></code> qualifier indicates\ncaching of data at all levels including L1 cache. 
Cache operator are treated as performance hints\nonly.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> is treated as a weak memory operation in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier is a hint to fetch additional data of the specified size\ninto the respective cache level.The sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch_size</span></code> can be set to either of <code class=\"docutils literal notranslate\"><span class=\"pre\">64B</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">128B</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">256B</span></code> thereby allowing the prefetch size to be 64 Bytes, 128 Bytes or 256 Bytes\nrespectively.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> may only be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space and with\ngeneric addressing where the address points to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space. If the generic address does\nnot fall within the address window of the global memory, then the prefetching behavior is undefined.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier is treated as a performance hint only.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. 
The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space and for generic\naddressing where the address points to the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifiers introduced in PTX ISA\nversion 7.4.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">ignore-src</span></code> operand introduced in PTX ISA version 7.5.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div 
class=\"highlight\"><pre><span></span>cp.async.ca.shared.global  [shrd],    [gbl + 4], 4;\ncp.async.ca.shared::cta.global  [%r0 + 8], [%r1],     8;\ncp.async.cg.shared.global  [%r2],     [%r3],     16;\n\ncp.async.cg.shared.global.L2::64B   [%r2],      [%r3],     16;\ncp.async.cg.shared.global.L2::128B  [%r0 + 16], [%r1],     16;\ncp.async.cg.shared.global.L2::256B  [%r2 + 32], [%r3],     16;\n\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 cache-policy, 0.25;\ncp.async.ca.shared.global.L2::cache_hint [%r2], [%r1], 4, cache-policy;\n\ncp.async.ca.shared.global                   [shrd], [gbl], 4, p;\ncp.async.cg.shared.global.L2::cache_hint   [%r0], [%r2], 16, q, cache-policy;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk\">\n\n\n<p>Initiates an asynchronous copy operation from one state space to another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.bulk.dst.src.completion_mechanism{.multicast}{.level::cache_hint}\n                      [dstMem], [srcMem], size, [mbar] {, ctaMask} {, cache-policy}\n\n.dst =                  { .shared::cluster }\n.src =                  { .global }\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n.level::cache_hint =    { .L2::cache_hint }\n.multicast =            { .multicast::cluster  }\n\n\ncp.async.bulk.dst.src.completion_mechanism [dstMem], [srcMem], size, [mbar]\n\n.dst =                  { .shared::cluster }\n.src =                  { .shared::cta }\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n\n\ncp.async.bulk.dst.src.completion_mechanism{.level::cache_hint} [dstMem], [srcMem], size {, cache-policy}\n\n.dst =                  { .global }\n.src =                  { .shared::cta }\n.completion_mechanism = { .bulk_group }\n.level::cache_hint =    { .L2::cache_hint 
}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> is a non-blocking instruction which initiates an asynchronous bulk-copy operation\nfrom the location specified by source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> to the location specified by\ndestination address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code>.</p>\n<p>The direction of bulk-copy is from the state space specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> modifier to the state\nspace specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code> modifiers.</p>\n<p>The 32-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> specifies the amount of memory to be copied, in terms of number of\nbytes. <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> must be a multiple of 16. If the value is not a multiple of 16, then the behavior is\nundefined. The memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[dstMem,</span> <span class=\"pre\">dstMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the destination memory\nspace and the memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[srcMem,</span> <span class=\"pre\">srcMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the source memory\nspace. Otherwise, the behavior is undefined. 
The addresses <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> must be aligned\nto 16 bytes.</p>\n<p>When the source of the copy is <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> and the destination is <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code>, the\ndestination has to be in the shared memory of a different CTA within the cluster.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies the completion mechanism that is supported on the\ninstruction variant. The completion mechanisms that are supported for different variants are\nsummarized in the following table:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 20%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 17%\"/>\n<col style=\"width: 43%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Completion mechanism</p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code></p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td rowspan=\"2\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::...</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td rowspan=\"2\"><p>mbarrier based completion mechanism</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n<td><p><em>Bulk async-group</em> based completion mechanism</p></td>\n</tr>\n</tbody>\n</table>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::complete_tx::bytes</span></code> specifies that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> variant uses\nmbarrier based completion mechanism. The <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation, with <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument equal to amount of data copied in bytes, will be\nperformed on the mbarrier object specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code> specifies that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> variant uses <em>bulk async-group</em>\nbased completion mechanism.</p>\n<p>The optional modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.multicast::cluster</span></code> allows copying of data from global memory to shared\nmemory of multiple CTAs in the cluster. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">ctaMask</span></code> specifies the destination CTAs in the\ncluster such that each bit position in the 16-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">ctaMask</span></code> operand corresponds to the <code class=\"docutils literal notranslate\"><span class=\"pre\">%ctaid</span></code>\nof the destination CTA. The source data is multicast to the same CTA-relative offset as <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code>\nin the shared memory of each destination CTA. The mbarrier signal is also multicast to the same\nCTA-relative offset as <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code> in the shared memory of the destination CTA.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. 
The\nqualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported when at least one of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>\nstatespaces is <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p>The copy operation in <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> is treated as a weak memory operation and the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the mbarrier has <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> semantics at the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.multicast::cluster</span></code> qualifier is optimized for target architecture <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code> and may have\nsubstantially reduced performance on other targets and hence <code class=\"docutils literal notranslate\"><span class=\"pre\">.multicast::cluster</span></code> is advised to\nbe used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code> <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or 
higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.multicast::cluster</span></code> qualifier advised to be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code> <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code>.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// .global -&gt; .shared::cluster:\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar];\n\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster\n                                             [dstMem], [srcMem], size, [mbar], ctaMask;\n\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint\n                                             [dstMem], [srcMem], size, [mbar], cache-policy;\n\n\n// .shared::cta -&gt; .shared::cluster (strictly remote):\ncp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar];\n\n// .shared::cta -&gt; .global:\ncp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size;\n\ncp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint} [dstMem], [srcMem], size, cache-policy;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk.commit_group</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk-commit-group\">\n\n\n<p>Commits all prior initiated but uncommitted <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> instructions into a\n<em>cp.async.bulk-group</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.bulk.commit_group;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">cp.async.bulk.commit_group</span></code> instruction creates a new per-thread <em>bulk async-group</em> and batches\nall prior <code class=\"docutils literal notranslate\"><span class=\"pre\">cp{.reduce}.async.bulk.{.prefetch}{.tensor}</span></code> instructions satisfying the following\nconditions into the new <em>bulk async-group</em>:</p>\n<ul class=\"simple\">\n<li><p>The prior <code class=\"docutils literal notranslate\"><span class=\"pre\">cp{.reduce}.async.bulk.{.prefetch}{.tensor}</span></code> instructions use <em>bulk_group</em> based\ncompletion mechanism, and</p></li>\n<li><p>They are initiated by the executing thread but not committed to any <em>bulk async-group</em>.</p></li>\n</ul>\n<p>If there are no uncommitted <code class=\"docutils literal notranslate\"><span class=\"pre\">cp{.reduce}.async.bulk.{.prefetch}{.tensor}</span></code> instructions then\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.commit_group</span></code> results in an empty <em>bulk async-group</em>.</p>\n<p>An executing thread can wait for the completion of all\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp{.reduce}.async.bulk.{.prefetch}{.tensor}</span></code> operations in a <em>bulk async-group</em> using\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code>.</p>\n<p>There is no memory ordering guarantee provided between any two\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp{.reduce}.async.bulk.{.prefetch}{.tensor}</span></code> operations within the same <em>bulk async-group</em>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div 
class=\"highlight\"><pre><span></span>cp.async.bulk.commit_group;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk.prefetch</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk-prefetch\">\n\n\n<p>Provides a hint to the system to initiate the asynchronous prefetch of data to the cache.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.bulk.prefetch.L2.src{.level::cache_hint}   [srcMem], size {, cache-policy}\n\n.src =                { .global }\n.level::cache_hint =  { .L2::cache_hint }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.prefetch</span></code> is a non-blocking instruction which may initiate an asynchronous prefetch\nof data from the location specified by source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code>, in <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> statespace, to\nthe L2 cache.</p>\n<p>The 32-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> specifies the amount of memory to be prefetched in terms of number of\nbytes. <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> must be a multiple of 16. If the value is not a multiple of 16, then the behavior is\nundefined. 
The memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[dstMem,</span> <span class=\"pre\">dstMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the destination memory\nspace and the memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[srcMem,</span> <span class=\"pre\">srcMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the source memory\nspace. Otherwise, the behavior is undefined. The address <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> must be aligned to 16 bytes.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. 
It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.bulk.prefetch.L2.global                 [srcMem], size;\n\ncp.async.bulk.prefetch.L2.global.L2::cache_hint  [srcMem], size, policy;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor\">\n\n\n<p>Provides a hint to the system to initiate the asynchronous prefetch of tensor data to the cache.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// global -&gt; shared::cluster:\ncp.async.bulk.prefetch.tensor.dim.L2.src{.load_mode}{.level::cache_hint} [tensorMap, tensorCoords]\n                                                             {, im2colOffsets } {, cache-policy}\n\n.src =                { .global }\n.dim =                { .1d, .2d, .3d, .4d, .5d }\n.load_mode =          { .tile, .im2col }\n.level::cache_hint =  { .L2::cache_hint }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.prefetch.tensor</span></code> is a non-blocking instruction which may initiate an asynchronous\nprefetch of tensor data from the location in <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> statespace to the L2 cache.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> is the generic address of the opaque 
tensor-map object which resides\neither in <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space or <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code> space or <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space. The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> specifies\nthe properties of the tensor copy operation, as described in <a class=\"reference external\" href=\"#tensor-tensormap\">Tensor-map</a>.\nThe <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> is accessed in tensormap proxy. Refer to the <em>CUDA programming guide</em> for creating\nthe tensor-map objects on the host side.</p>\n<p>The dimension of the tensor data is specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code> modifier.</p>\n<p>The vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> specifies the starting coordinates in the tensor data in the\nglobal memory from or to which the copy operation has to be performed. The number of tensor\ncoordinates in the vector argument <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> should be equal to the dimension specified by\nthe modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code>. The individual tensor coordinates in <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> are of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> specifies how the data in the source location is copied into the\ndestination location. 
If <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> is not specified, it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>. In <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>\nmode, the multi-dimensional layout of the source tensor is preserved at the destination.\nIn <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode, some dimensions of the source tensors are unrolled in a single dimensional column\nat the destination. Details of the <code class=\"docutils literal notranslate\"><span class=\"pre\">im2col</span></code> mode are described in <a class=\"reference external\" href=\"#tensor-im2col-mode\">Im2col mode</a>.\nIn <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode, the tensor has to be at least 3-dimensional. The vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">im2colOffsets</span></code> can\nbe specified only when <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code>. The length of the vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">im2colOffsets</span></code>\nis two less than the number of dimension <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code> of the tensor operation.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. 
The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.prefetch.tensor</span></code> is treated as a weak memory operation in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency\nModel</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b16 ctaMask;\n.reg .u16 i2cOffW, i2cOffH, i2cOffD;\n.reg .b64 l2CachePolicy;\n\ncp.async.bulk.prefetch.tensor.1d.L2.global.tile  [tensorMap0, {tc0}];\n\n@p cp.async.bulk.prefetch.tensor.2d.L2.global    [tensorMap1, {tc0, tc1}];\n\n@p cp.async.bulk.prefetch.tensor.5d.L2.global.im2col\n                      [tensorMap2, {tc0, tc1, tc2, tc3, tc4}], {i2cOffW, i2cOffH, i2cOffD};\n\n@p cp.async.bulk.prefetch.tensor.3d.L2.global.im2col.L2::cache_hint\n                      [tensorMap3, {tc0, tc1, tc2}], {i2cOffW}, policy;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk.tensor</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk-tensor\">\n\n\n<p>Initiates an asynchronous copy operation on the tensor data from one state space to another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div 
class=\"highlight\"><pre><span></span>// global -&gt; shared::cluster:\ncp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.multicast}{.level::cache_hint}\n                                   [dstMem], [tensorMap, tensorCoords], [mbar]{, im2colOffsets}\n                                   {, ctaMask} {, cache-policy}\n\n.dst =                  { .shared::cluster }\n.src =                  { .global }\n.dim =                  { .1d, .2d, .3d, .4d, .5d }\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n.load_mode =            { .tile, .im2col }\n.level::cache_hint =    { .L2::cache_hint }\n.multicast =            { .multicast::cluster  }\n\n\n// shared::cta -&gt; global:\ncp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.level::cache_hint}\n                                   [tensorMap, tensorCoords], [srcMem] {, cache-policy}\n\n.dst =                  { .global }\n.src =                  { .shared::cta }\n.dim =                  { .1d, .2d, .3d, .4d, .5d }\n.completion_mechanism = { .bulk_group }\n.load_mode =            { .tile, .im2col_no_offs }\n.level::cache_hint =    { .L2::cache_hint }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> is a non-blocking instruction which initiates an asynchronous copy\noperation of tensor data from the location in <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> state space to the location in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>\nstate space.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code> specifies the location in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code> state space into which the tensor data\nhas to be copied and <code class=\"docutils literal notranslate\"><span 
class=\"pre\">srcMem</span></code> specifies the location in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> state space from which the\ntensor data has to be copied.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> is the generic address of the opaque tensor-map object which resides\neither in <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space or <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code> space or <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space. The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> specifies\nthe properties of the tensor copy operation, as described in <a class=\"reference external\" href=\"#tensor-tensormap\">Tensor-map</a>.\nThe <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> is accessed in tensormap proxy. Refer to the <em>CUDA programming guide</em> for creating\nthe tensor-map objects on the host side.</p>\n<p>The dimension of the tensor data is specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code> modifier.</p>\n<p>The vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> specifies the starting coordinates in the tensor data in the\nglobal memory from or to which the copy operation has to be performed. The number of tensor\ncoordinates in the vector argument <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> should be equal to the dimension specified by\nthe modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code>. 
The individual tensor coordinates in <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> are of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies the completion mechanism that is supported on the\ninstruction variant. The completion mechanisms that are supported for different variants are\nsummarized in the following table:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 20%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 17%\"/>\n<col style=\"width: 43%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Completion mechanism</p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code></p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::...</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td><p>mbarrier based completion mechanism</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n<td><p><em>Bulk async-group</em> based completion mechanism</p></td>\n</tr>\n</tbody>\n</table>\n<p>The 
modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::complete_tx::bytes</span></code> specifies that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> variant\nuses mbarrier based completion mechanism. The <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation, with <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument equal to amount of data copied in bytes, will be\nperformed on the mbarrier object specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code> specifies that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> variant uses <em>bulk\nasync-group</em> based completion mechanism.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> specifies how the data in the source location is copied into the\ndestination location. If <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> is not specified, it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>. In <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>\nmode, the multi-dimensional layout of the source tensor is preserved at the destination. In\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode, some dimensions of the source tensors are unrolled in a single dimensional column\nat the destination. 
Details of the <code class=\"docutils literal notranslate\"><span class=\"pre\">im2col</span></code> mode are described in <a class=\"reference external\" href=\"#tensor-im2col-mode\">Im2col mode</a>. In <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode, the tensor has to be at least\n3-dimensional. The vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">im2colOffsets</span></code> can be specified only when <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> is\n.im2col. The length of the vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">im2colOffsets</span></code> is two less than the number of dimension\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code> of the tensor operation. The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col_no_offs</span></code> is the same as <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode\nexcept there is no <code class=\"docutils literal notranslate\"><span class=\"pre\">im2colOffsets</span></code> vector involved.</p>\n<p>The optional modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.multicast::cluster</span></code> allows copying of data from global memory to shared\nmemory of multiple CTAs in the cluster. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">ctaMask</span></code> specifies the destination CTAs in the\ncluster such that each bit position in the 16-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">ctaMask</span></code> operand corresponds to the <code class=\"docutils literal notranslate\"><span class=\"pre\">%ctaid</span></code>\nof the destination CTA. 
The source data is multicast to the same offset as <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code> in the shared\nmemory of each destination CTA. The mbarrier signal is also multicast to the same offset as <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>\nin the shared memory of the destination CTA.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. 
The\nqualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported when at least one of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>\nstatespaces is <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p>The copy operation in <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> is treated as a weak memory operation and the\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the mbarrier has <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> semantics at the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.multicast::cluster</span></code> qualifier is optimized for target architecture <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code> and may have\nsubstantially reduced performance on other targets and hence <code class=\"docutils literal notranslate\"><span class=\"pre\">.multicast::cluster</span></code> is advised to\nbe used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code> <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or 
higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.multicast::cluster</span></code> qualifier advised to be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code> <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code>.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b16 ctaMask;\n.reg .u16 i2cOffW, i2cOffH, i2cOffD;\n.reg .b64 l2CachePolicy;\n\ncp.async.bulk.tensor.1d.shared::cluster.global.tile  [sMem0], [tensorMap0, {tc0}], [mbar0];\n\n@p cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster\n                     [sMem1], [tensorMap1, {tc0, tc1}], [mbar2], ctaMask;\n\n@p cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes\n                     [sMem2], [tensorMap2, {tc0, tc1, tc2, tc3, tc4}], [mbar2], {i2cOffW, i2cOffH, i2cOffD};\n\n@p cp.async.bulk.tensor.3d.im2col.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint\n                     [sMem3], [tensorMap3, {tc0, tc1, tc2}], [mbar3], {i2cOffW}, policy;\n\n@p cp.async.bulk.tensor.1d.global.shared::cta.bulk_group  [tensorMap3, {tc0}], [sMem3];\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk.wait_group</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk-wait-group\">\n\n\n<p>Wait for completion of <em>bulk async-groups</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.bulk.wait_group{.read} N;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.wait_group</span></code> instruction will cause the executing thread to wait until only N or\nfewer of the most recent <em>bulk async-groups</em> are pending 
and all the prior <em>bulk async-groups</em>\ncommitted by the executing threads are complete. For example, when N is 0, the executing thread\nwaits on all the prior <em>bulk async-groups</em> to complete. Operand N is an integer constant.</p>\n<p>By default, <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.wait_group</span></code> instruction will cause the executing thread to wait till\nall the bulk async operations in the specified <em>bulk async-group</em> have completed all of the\nfollowing:</p>\n<ul class=\"simple\">\n<li><p>Reading from the source locations.</p></li>\n<li><p>Writing to their respective destination locations.</p></li>\n<li><p>Writes being made visible to the executing thread.</p></li>\n</ul>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.read</span></code> modifier indicates that the waiting has to be done until all the bulk async\noperations in the specified <em>bulk async-group</em> have completed reading from their source locations.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.bulk.wait_group.read   0;\ncp.async.bulk.wait_group        2;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.commit_group</h1><section id=\"data-movement-and-conversion-instructions-cp-async-commit-group\">\n\n\n<p>Commits all prior initiated but uncommitted <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> instructions into a <em>cp.async-group</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.commit_group 
;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.commit_group</span></code> instruction creates a new <em>cp.async-group</em> per thread and batches all\nprior <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> instructions initiated by the executing thread but not committed to any\n<em>cp.async-group</em> into the new <em>cp.async-group</em>. If there are no uncommitted <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>\ninstructions then <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.commit_group</span></code> results in an empty <em>cp.async-group.</em></p>\n<p>An executing thread can wait for the completion of all <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations in a <em>cp.async-group</em>\nusing <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code>.</p>\n<p>There is no memory ordering guarantee provided between any two <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations within the\nsame <em>cp.async-group</em>. 
So two or more <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations within a <em>cp.async-group</em> copying data\nto the same location results in undefined behavior.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Example 1:\ncp.async.ca.shared.global [shrd], [gbl], 4;\ncp.async.commit_group ; // Marks the end of a cp.async group\n\n// Example 2:\ncp.async.ca.shared.global [shrd1],   [gbl1],   8;\ncp.async.ca.shared.global [shrd1+8], [gbl1+8], 8;\ncp.async.commit_group ; // Marks the end of cp.async group 1\n\ncp.async.ca.shared.global [shrd2],    [gbl2],    16;\ncp.async.cg.shared.global [shrd2+16], [gbl2+16], 16;\ncp.async.commit_group ; // Marks the end of cp.async group 2\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive</h1><section id=\"parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive\">\n\n\n<p>Makes the <em>mbarrier object</em> track all prior <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations initiated by the\nexecuting thread.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.mbarrier.arrive{.noinc}{.shared{::cta}}.b64 [addr];\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Causes an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> to be\ntriggered by the system on the <em>mbarrier object</em> upon the completion of all prior <a 
class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations initiated by the\nexecuting thread. The <em>mbarrier object</em> is at the location specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. The\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> is\nasynchronous to execution of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.mbarrier.arrive</span></code>.</p>\n<p>When <code class=\"docutils literal notranslate\"><span class=\"pre\">.noinc</span></code> modifier is not specified, the pending count of the mbarrier object is incremented\nby 1 prior to the asynchronous <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a>. This\nresults in a zero-net change for the pending count from the asynchronous <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a> operation\nduring the current phase. The pending count of the <em>mbarrier object</em> after the increment should not\nexceed the limit as mentioned in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier object</a>. Otherwise,\nthe behavior is undefined.</p>\n<p>When the <code class=\"docutils literal notranslate\"><span class=\"pre\">.noinc</span></code> modifier is specified, the increment to the pending count of the <em>mbarrier\nobject</em> is not performed. 
Hence the decrement of the pending count done by the asynchronous\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> must be\naccounted for in the initialization of the <em>mbarrier object</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Example 1: no .noinc\nmbarrier.init.shared.b64 [shMem], threadCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 
4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n....\n// Absence of .noinc accounts for arrive-on from completion of prior cp.async operations.\n// So mbarrier.init must only account for arrive-on from mbarrier.arrive.\ncp.async.mbarrier.arrive.shared.b64 [shMem];\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n\n\n\n// Example 2: with .noinc\n\n// Tracks arrive-on from mbarrier.arrive and cp.async.mbarrier.arrive.\n\n// All threads participating in the mbarrier perform cp.async\nmov.b32 copyOperationCnt, threadCount;\n\n// 3 arrive-on operations will be triggered per-thread\nmul.lo.u32 copyArrivalCnt, copyOperationCnt, 3;\n\nadd.u32 totalCount, threadCount, copyArrivalCnt;\n\nmbarrier.init.shared.b64 [shMem], totalCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n...\n// Presence of .noinc requires mbarrier initalization to have accounted for arrive-on from cp.async\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 1st instance\n....\ncp.async.ca.shared.global [shard3], [gbl3], 4;\ncp.async.ca.shared.global [shard4], [gbl4], 16;\ncp.async.mbarrier.arrive.noinc.shared::cta.b64 [shMem]; // 2nd instance\n....\ncp.async.ca.shared.global [shard5], [gbl5], 4;\ncp.async.cg.shared.global [shard6], [gbl6], 16;\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 3rd and last instance\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all</h1><section id=\"data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all\">\n<span id=\"data-movement-and-conversion-instructions-cp-async-wait-group\"></span>\n\n<p>Wait for completion of prior asynchronous copy 
operations.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.wait_group N;\ncp.async.wait_all ;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> instruction will cause executing thread to wait till only <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> or fewer of\nthe most recent <em>cp.async-group</em>s are pending and all the prior <em>cp.async-group</em>s committed by\nthe executing threads are complete. For example, when <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> is 0, the executing thread waits on all\nthe prior <em>cp.async-group</em>s to complete. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> is an integer constant.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> is equivalent to :</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.commit_group;\ncp.async.wait_group 0;\n</pre></div>\n</div>\n<p>An empty <em>cp.async-group</em> is considered to be trivially complete.</p>\n<p>Writes performed by <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations are made visible to the executing thread only after:</p>\n<ol class=\"arabic simple\">\n<li><p>The completion of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or</p></li>\n<li><p>The completion of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> on the <em>cp.async-group</em> in which the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>\nbelongs to or</p></li>\n<li><p><a class=\"reference external\" 
href=\"#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait\">mbarrier.test_wait</a>\nreturns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> on an <em>mbarrier object</em> which is tracking the completion of the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>\noperation.</p></li>\n</ol>\n<p>There is no ordering between two <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations that are not synchronized with\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> or <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier\">mbarrier objects</a>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> does not provide any ordering and visibility\nguarantees for any other memory operation apart from <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Example of .wait_all:\ncp.async.ca.shared.global [shrd1], [gbl1], 4;\ncp.async.cg.shared.global [shrd2], [gbl2], 16;\ncp.async.wait_all;  // waits for all prior cp.async to complete\n\n// Example of .wait_group :\ncp.async.ca.shared.global [shrd3], [gbl3], 8;\ncp.async.commit_group;  // End of group 1\n\ncp.async.cg.shared.global [shrd4], 
[gbl4], 16;\ncp.async.commit_group;  // End of group 2\n\ncp.async.cg.shared.global [shrd5], [gbl5], 16;\ncp.async.commit_group;  // End of group 3\n\ncp.async.wait_group 1;  // waits for group 1 and group 2 to complete\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all</h1><section id=\"data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all\">\n<span id=\"data-movement-and-conversion-instructions-cp-async-wait-group\"></span>\n\n<p>Wait for completion of prior asynchronous copy operations.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.wait_group N;\ncp.async.wait_all ;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> instruction will cause executing thread to wait till only <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> or fewer of\nthe most recent <em>cp.async-group</em>s are pending and all the prior <em>cp.async-group</em>s committed by\nthe executing threads are complete. For example, when <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> is 0, the executing thread waits on all\nthe prior <em>cp.async-group</em>s to complete. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> is an integer constant.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> is equivalent to :</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.commit_group;\ncp.async.wait_group 0;\n</pre></div>\n</div>\n<p>An empty <em>cp.async-group</em> is considered to be trivially complete.</p>\n<p>Writes performed by <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations are made visible to the executing thread only after:</p>\n<ol class=\"arabic simple\">\n<li><p>The completion of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or</p></li>\n<li><p>The completion of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> on the <em>cp.async-group</em> in which the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>\nbelongs to or</p></li>\n<li><p><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait\">mbarrier.test_wait</a>\nreturns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> on an <em>mbarrier object</em> which is tracking the completion of the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>\noperation.</p></li>\n</ol>\n<p>There is no ordering between two <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations that are not synchronized with\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> or <a class=\"reference external\" 
href=\"#parallel-synchronization-and-communication-instructions-mbarrier\">mbarrier objects</a>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> does not provide any ordering and visibility\nguarantees for any other memory operation apart from <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Example of .wait_all:\ncp.async.ca.shared.global [shrd1], [gbl1], 4;\ncp.async.cg.shared.global [shrd2], [gbl2], 16;\ncp.async.wait_all;  // waits for all prior cp.async to complete\n\n// Example of .wait_group :\ncp.async.ca.shared.global [shrd3], [gbl3], 8;\ncp.async.commit_group;  // End of group 1\n\ncp.async.cg.shared.global [shrd4], [gbl4], 16;\ncp.async.commit_group;  // End of group 2\n\ncp.async.cg.shared.global [shrd5], [gbl5], 16;\ncp.async.commit_group;  // End of group 3\n\ncp.async.wait_group 1;  // waits for group 1 and group 2 to complete\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.reduce.async.bulk</h1><section id=\"data-movement-and-conversion-instructions-cp-reduce-async-bulk\">\n\n\n<p>Initiates an asynchronous reduction operation.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.reduce.async.bulk.dst.src.completion_mechanism.redOp.type\n              [dstMem], [srcMem], size, [mbar]\n\n.dst =                  { .shared::cluster }\n.src =                  { .shared::cta 
}\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n.redOp=                 { .and, .or, .xor,\n                          .add, .inc, .dec,\n                          .min, .max }\n.type =                 { .b32, .u32, .s32, .b64, .u64 }\n\n\ncp.reduce.async.bulk.dst.src.completion_mechanism{.level::cache_hint}.redOp.type\n               [dstMem], [srcMem], size{, cache-policy}\n\n.dst =                  { .global      }\n.src =                  { .shared::cta }\n.completion_mechanism = { .bulk_group }\n.level::cache_hint    = { .L2::cache_hint }\n.redOp=                 { .and, .or, .xor,\n                          .add, .inc, .dec,\n                          .min, .max }\n.type =                 { .f16, .bf16, .b32, .u32, .s32, .b64, .u64, .s64, .f32, .f64 }\n\n\ncp.reduce.async.bulk.dst.src.completion_mechanism{.level::cache_hint}.add.noftz.type\n               [dstMem], [srcMem], size{, cache-policy}\n.dst  =                 { .global }\n.src  =                 { .shared::cta }\n.completion_mechanism = { .bulk_group }\n.type =                 { .f16, .bf16 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk</span></code> is a non-blocking instruction which initiates an asynchronous reduction\noperation on an array of memory locations specified by the destination address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code>\nwith the source array whose location is specified by the source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code>. 
The size\nof the source and the destination array must be the same and is specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code>.</p>\n<p>Each data element in the destination array is reduced inline with the corresponding data element in\nthe source array with the reduction operation specified by the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code>. The type of each\ndata element in the source and the destination array is specified by the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code>.</p>\n<p>The source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> is located in the state space specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> and the\ndestination address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code> is located in the state specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>.</p>\n<p>The 32-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> specifies the amount of memory to be copied from the source location and\nused in the reduction operation, in terms of number of bytes. <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> must be a multiple of 16. If\nthe value is not a multiple of 16, then the behavior is undefined. 
The memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[dstMem,</span>\n<span class=\"pre\">dstMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the destination memory space and the memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[srcMem,</span>\n<span class=\"pre\">srcMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the source memory space. Otherwise, the behavior is\nundefined. The addresses <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> must be aligned to 16 bytes.</p>\n<p>The operations supported by <code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code> are classified as follows:</p>\n<ul class=\"simple\">\n<li><p>The bit-size operations are <code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code>.</p></li>\n<li><p>The integer operations are <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code>. 
The <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code> operations return a result in the range <code class=\"docutils literal notranslate\"><span class=\"pre\">[0..x]</span></code> where <code class=\"docutils literal notranslate\"><span class=\"pre\">x</span></code> is the value at the source\nstate space.</p></li>\n<li><p>The floating point operation <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code> rounds to the nearest even. The current implementation of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.add.f32</span></code> flushes subnormal inputs and results to sign-preserving zero. The\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.add.f16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.add.bf16</span></code> operations require\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.noftz</span></code> qualifier. 
It preserves input and result subnormals, and does not flush them to zero.</p></li>\n</ul>\n<p>The following table describes the valid combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code> and element type:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 18%\"/>\n<col style=\"width: 24%\"/>\n<col style=\"width: 58%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code></p></th>\n<th class=\"head\"><p>Element type</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td rowspan=\"4\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code></p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code></p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">.u32</span></code></p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code></p></td>\n</tr>\n<tr class=\"row-even\">\n<td rowspan=\"4\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code></p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal 
notranslate\"><span class=\"pre\">.bf16</span></code></p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code></p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n</tr>\n</tbody>\n</table>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies the completion mechanism that is supported on the\ninstruction variant. 
The completion mechanisms that are supported for different variants are\nsummarized in the following table:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 20%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 17%\"/>\n<col style=\"width: 43%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Completion mechanism</p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code></p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td rowspan=\"2\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::...</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td rowspan=\"2\"><p>mbarrier based completion mechanism</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n<td><p><em>Bulk async-group</em> based completion mechanism</p></td>\n</tr>\n</tbody>\n</table>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::complete_tx::bytes</span></code> specifies that the <code 
class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk</span></code> variant\nuses mbarrier based completion mechanism. The <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation, with <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument equal to amount of data copied in bytes, will be\nperformed on the mbarrier object specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code> specifies that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk</span></code> variant uses <em>bulk\nasync-group</em> based completion mechanism.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. 
The\nqualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported when at least one of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>\nstatespaces is <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p>Each reduction operation performed by the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk</span></code> has individually <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed.gpu</span></code>\nmemory ordering semantics. The load operations in <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk</span></code> are treated as weak\nmemory operation and the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the mbarrier has <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> semantics at the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64\n                                                                  [dstMem], [srcMem], size, 
[mbar];\n\ncp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32\n                                                                  [dstMem], [srcMem], size, [mbar];\n\ncp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [dstMem], [srcMem], size;\n\ncp.reduce.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.xor.s32 [dstMem], [srcMem], size, policy;\n\ncp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [dstMem], [srcMem], size;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor</h1><section id=\"data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor\">\n\n\n<p>Initiates an asynchronous reduction operation on the tensor data.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// shared::cta -&gt; global:\ncp.reduce.async.bulk.tensor.dim.dst.src.redOp{.load_mode}.completion_mechanism{.level::cache_hint}\n                                          [tensorMap, tensorCoords], [srcMem] {,cache-policy}\n\n.dst =                  { .global }\n.src =                  { .shared::cta }\n.dim =                  { .1d, .2d, .3d, .4d, .5d }\n.completion_mechanism = { .bulk_group }\n.load_mode =            { .tile, .im2col_no_offs }\n.redOp =                { .add, .min, .max, .inc, .dec, .and, .or, .xor}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.tensor</span></code> is a non-blocking instruction which initiates an asynchronous\nreduction operation of tensor data in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code> state space with tensor data in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code>\nstate space.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span 
class=\"pre\">srcMem</span></code> specifies the location of the tensor data in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> state space using\nwhich the reduction operation has to be performed.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> is the generic address of the opaque tensor-map object which resides\neither in <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space or <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code> space or <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space. The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> specifies\nthe properties of the tensor copy operation, as described in <a class=\"reference external\" href=\"#tensor-tensormap\">Tensor-map</a>.\nThe <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> is accessed in tensormap proxy. Refer to the <em>CUDA programming guide</em> for creating\nthe tensor-map objects on the host side.</p>\n<p>Each element of the tensor data in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code> state space is reduced inline with the corresponding\nelement from the tensor data in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> state space. The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code> specifies the\nreduction operation used for the inline reduction. 
The type of each tensor data element in the\nsource and the destination tensor is specified in <a class=\"reference external\" href=\"#tensor-tensormap\">Tensor-map</a>.</p>\n<p>The dimension of the tensor is specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code> modifier.</p>\n<p>The vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> specifies the starting coordinates of the tensor data in the\nglobal memory on which the reduce operation is to be performed. The number of tensor coordinates in\nthe vector argument <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> should be equal to the dimension specified by the modifier\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code>. The individual tensor coordinates are of the type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p>The following table describes the valid combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code> and element type:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 32%\"/>\n<col style=\"width: 68%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code></p></th>\n<th class=\"head\"><p>Element type</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code></p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code></p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code></p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n</tr>\n</tbody>\n</table>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies the completion mechanism that is supported on the\ninstruction variant. 
Value <code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code> of the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies that\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.tensor</span></code> instruction uses <em>bulk async-group</em> based completion mechanism.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> specifies how the data in the source location is copied into the\ndestination location. If <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> is not specified, it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>. In <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>\nmode, the multi-dimensional layout of the source tensor is preserved at the destination. In\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col_no_offs</span></code> mode, some dimensions of the source tensors are unrolled in a single dimensional\ncolumn at the destination. Details of the <code class=\"docutils literal notranslate\"><span class=\"pre\">im2col</span></code> mode are described in <a class=\"reference external\" href=\"#tensor-im2col-mode\">Im2col mode</a>. In <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode, the tensor has to be at least\n3-dimensional.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. 
The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. The\nqualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported when at least one of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>\nstatespaces is <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p>Each reduction operation performed by <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.tensor</span></code> has individually\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed.gpu</span></code> memory ordering semantics. 
The load operations in <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.tensor</span></code>\nare treated as weak memory operations and the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the mbarrier has <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> semantics at the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group\n                                             [tensorMap0, {tc0}], [sMem0];\n\ncp.reduce.async.bulk.tensor.2d.global.shared::cta.and.bulk_group.L2::cache_hint\n                                             [tensorMap1, {tc0, tc1}], [sMem1] , policy;\n\ncp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col.bulk_group\n                                             [tensorMap2, {tc0, tc1, tc2}], [sMem2]\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Data Movement and Conversion Instructions: cp.async\n\n\n\nInitiates an asynchronous copy operation from one state space to another.\n\nSyntax\n\ncp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n\n                         [dst], [src], cp-size{, src-size}{, cache-policy} ;\n\ncp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n\n                         [dst], [src], 16{, src-size}{, cache-policy} ;\n\ncp.async.ca.shared...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk\n\n\n\nInitiates an asynchronous copy operation from one state space to another.\n\nSyntax\n\ncp.async.bulk.dst.src.completion_mechanism{.multicast}{.level::cache_hint}\n\n                      [dstMem], [srcMem], size, [mbar] {, ctaMask} {, cache-policy}\n\n.dst =                  { .shared::cluster }\n\n.src =                  { .global }\n\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n\n.level::cache_hint = ...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.commit_group\n\n\n\nCommits all prior initiated but uncommitted cp.async.bulk instructions into a\n\ncp.async.bulk-group.\n\nSyntax\n\ncp.async.bulk.commit_group;\n\nDescription\n\ncp.async.bulk.commit_group instruction creates a new per-thread bulk async-group and batches\n\nall prior cp{.reduce}.async.bulk.{.prefetch}{.tensor} instructions satisfying the following\n\nconditions into the new bulk async-group:\n\nThe prior cp{.reduce}.async...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.prefetch\n\n\n\nProvides a hint to the system to initiate the asynchronous prefetch of data to the cache.\n\nSyntax\n\ncp.async.bulk.prefetch.L2.src{.level::cache_hint}   [srcMem], size {, cache-policy}\n\n.src =                { .global }\n\n.level::cache_hint =  { .L2::cache_hint }\n\nDescription\n\ncp.async.bulk.prefetch is a non-blocking instruction which may initiate an asynchronous prefetch\n\nof data 
from the location specifie...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor\n\n\n\nProvides a hint to the system to initiate the asynchronous prefetch of tensor data to the cache.\n\nSyntax\n\n// global -> shared::cluster:\n\ncp.async.bulk.prefetch.tensor.dim.L2.src{.load_mode}{.level::cache_hint} [tensorMap, tensorCoords]\n\n                                                             {, im2colOffsets } {, cache-policy}\n\n.src =                { .global }\n\n.dim =                { .1d, .2d, .3...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.tensor\n\n\n\nInitiates an asynchronous copy operation on the tensor data from one state space to another.\n\nSyntax\n\n// global -> shared::cluster:\n\ncp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.multicast}{.level::cache_hint}\n\n                                   [dstMem], [tensorMap, tensorCoords], [mbar]{, im2colOffsets}\n\n                                   {, ctaMask} {, cache-policy}\n\n.dst =      ...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.wait_group\n\n\n\nWait for completion of bulk async-groups.\n\nSyntax\n\ncp.async.bulk.wait_group{.read} N;\n\nDescription\n\ncp.async.bulk.wait_group instruction will cause the executing thread to wait until only N or\n\nfewer of the most recent bulk async-groups are pending and all the prior bulk async-groups\n\ncommitted by the executing threads are complete. For example, when N is 0, the executing thread\n\nwaits on all the prior b...\n\n=====Data Movement and Conversion Instructions: cp.async.commit_group\n\n\n\nCommits all prior initiated but uncommitted cp.async instructions into a cp.async-group.\n\nSyntax\n\ncp.async.commit_group ;\n\nDescription\n\ncp.async.commit_group instruction creates a new cp.async-group per thread and batches all\n\nprior cp.async instructions initiated by the executing thread but not committed to any\n\ncp.async-group into the new cp.async-group. 
If there are no uncommitted cp.async\n\ninstructio...\n\n=====Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive\n\n\n\nMakes the mbarrier object track all prior cp.async operations initiated by the\n\nexecuting thread.\n\nSyntax\n\ncp.async.mbar ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async"
            };

        case "createpolicy":
            // Documentation entry for the PTX `createpolicy` instruction.
            //   html    - "For more information" link followed by the instruction's reference section as HTML.
            //   tooltip - plain-text rendering of the same section; the text stops mid-word and ends in " ...",
            //             presumably cut by the generator at a length limit (TODO confirm against the generator).
            //   url     - anchor of this instruction in NVIDIA's PTX ISA documentation.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy\" target=\"_blank\" rel=\"noopener noreferrer\">createpolicy <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: createpolicy</h1><section id=\"data-movement-and-conversion-instructions-createpolicy\">\n\n\n<p>Create a cache eviction policy for the specified cache level.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Range-based policy\ncreatepolicy.range{.global}.level::primary_priority{.level::secondary_priority}.b64\n                                   cache-policy, [a], primary-size, total-size;\n\n// Fraction-based policy\ncreatepolicy.fractional.level::primary_priority{.level::secondary_priority}.b64\n                                   cache-policy{, fraction};\n\n// Converting the access property from CUDA APIs\ncreatepolicy.cvt.L2.b64            cache-policy, access-property;\n\n.level::primary_priority =   { .L2::evict_last, .L2::evict_normal,\n                               .L2::evict_first, .L2::evict_unchanged };\n.level::secondary_priority = { .L2::evict_first, .L2::evict_unchanged };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">createpolicy</span></code> instruction creates a cache eviction policy for the specified cache level in an\nopaque 64-bit register specified by the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code>. The cache eviction\npolicy specifies how cache eviction priorities are applied to global memory addresses used in memory\noperations with <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier.</p>\n<p>There are two types of cache eviction policies:</p>\n<ul>\n<li>\n<p>Range-based policy</p>\n<p>The cache eviction policy created using <code class=\"docutils literal notranslate\"><span class=\"pre\">createpolicy.range</span></code> specifies the cache eviction\nbehaviors for the following three address ranges:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">[a</span> <span class=\"pre\">..</span> <span class=\"pre\">a</span> <span class=\"pre\">+</span> <span class=\"pre\">(primary-size</span> <span class=\"pre\">-</span> <span class=\"pre\">1)]</span></code> referred to as primary range.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">[a</span> <span class=\"pre\">+</span> <span class=\"pre\">primary-size</span> <span class=\"pre\">..</span> <span class=\"pre\">a</span> <span class=\"pre\">+</span> <span class=\"pre\">(total-size</span> <span class=\"pre\">-</span> <span class=\"pre\">1)]</span></code> referred to as trailing secondary range.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">[a</span> <span class=\"pre\">-</span> <span class=\"pre\">(total-size</span> <span class=\"pre\">-</span> <span class=\"pre\">primary-size)</span> <span class=\"pre\">..</span> <span class=\"pre\">(a</span> <span class=\"pre\">-</span> <span class=\"pre\">1)]</span></code> referred to as preceding secondary range.</p></li>\n</ul>\n<p>When a range-based cache eviction policy is used in a memory operation with\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier, the eviction priorities are applied as follows:</p>\n<ul class=\"simple\">\n<li><p>If the memory address falls in the primary range, the eviction priority specified by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::primary_priority</span></code> is applied.</p></li>\n<li><p>If the memory address falls in any of the secondary ranges, the eviction priority specified by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::secondary_priority</span></code> is applied.</p></li>\n<li><p>If the memory address does not fall in either of the above ranges, then the applied eviction\npriority is unspecified.</p></li>\n</ul>\n<p>The 32-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">primary-size</span></code> specifies the size, in bytes, of the primary range. The\n32-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">total-size</span></code> specifies the combined size, in bytes, of the address range\nincluding primary and secondary ranges. The value of <code class=\"docutils literal notranslate\"><span class=\"pre\">primary-size</span></code> must be less than or equal\nto the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">total-size</span></code>. Maximum allowed value of <code class=\"docutils literal notranslate\"><span class=\"pre\">total-size</span></code> is 4GB.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::secondary_priority</span></code> is not specified, then it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::evict_unchanged</span></code>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the specified address does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space\nthen the behavior is undefined.</p>\n</li>\n<li>\n<p>Fraction-based policy</p>\n<p>A memory operation with <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier can use the fraction-based cache\neviction policy to request the cache eviction priority specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2:primary_priority</span></code> to\nbe applied to a fraction of cache accesses specified by the 32-bit floating point operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">fraction</span></code>. The remainder of the cache accesses get the eviction priority specified by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::secondary_priority</span></code>. This implies that in a memory operation that uses a fraction-based\ncache policy, the memory access has a probability specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">fraction</span></code> of\ngetting the cache eviction priority specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::primary_priority</span></code>.</p>\n<p>The valid range of values for the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">fraction</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">(0.0,..,</span> <span class=\"pre\">1.0]</span></code>. If the operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">fraction</span></code> is not specified, it defaults to 1.0.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::secondary_priority</span></code> is not specified, then it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::evict_unchanged</span></code>.</p>\n</li>\n</ul>\n<p>The access property created using the CUDA APIs can be converted into cache eviction policy by the\ninstruction <code class=\"docutils literal notranslate\"><span class=\"pre\">createpolicy.cvt</span></code>. The source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">access-property</span></code> is a 64-bit opaque\nregister. Refer to <em>CUDA programming guide</em> for more details.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>createpolicy.fractional.L2::evict_last.b64                      policy, 1.0;\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64  policy, 0.5;\n\ncreatepolicy.range.L2::evict_last.L2::evict_first.b64\n                                            policy, [ptr], 0x100000, 0x200000;\n\n// access-prop is created by CUDA APIs.\ncreatepolicy.cvt.L2.b64 policy, access-prop;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Create a cache eviction policy for the specified cache level.\n\nSyntax\n\n// Range-based policy\n\ncreatepolicy.range{.global}.level::primary_priority{.level::secondary_priority}.b64\n\n                                   cache-policy, [a], primary-size, total-size;\n\n// Fraction-based policy\n\ncreatepolicy.fractional.level::primary_priority{.level::secondary_priority}.b64\n\n                                   cache-policy{, fraction};\n\n// Converting the access property from CUDA APIs\n\ncreatepolicy.cvt.L2.b64            cache-policy, access-property;\n\n.level::primary_priority =   { .L2::evict_last, .L2::evict_normal,\n\n                               .L2::evict_first, .L2::evict_unchanged };\n\n.level::secondary_priority = { .L2::evict_first, .L2::evict_unchanged };\n\nDescription\n\nThe createpolicy instruction creates a cache eviction policy for the specified cache level in an\n\nopaque 64-bit register specified by the destination operand cache-policy. The cache eviction\n\npolicy specifies how cache eviction priorities are applied to global memory addresses used in memory\n\noperations with .level::cache_hint qualifier.\n\nThere are two types of cache eviction policies:\n\nRange-based policy\n\nThe cache eviction policy created using createpolicy.range specifies the cache eviction\n\nbehaviors for the following three address ranges:\n\n[a .. a + (primary-size - 1)] referred to as primary range.\n\n[a + primary-size .. a + (total-size - 1)] referred to as trailing secondary range.\n\n[a - (total-size - primary-size) .. (a - 1)] referred to as preceding secondary range.\n\nWhen a range-based cache eviction policy is used in a memory operation with\n\n.level::cache_hint qualifier, the eviction priorities are applied as follows:\n\nIf the memory address falls in the primary range, the eviction priority specified by\n\n.L2::primary_priority is applied.\n\nIf the memory address falls in any of the secondary ranges, the eviction priority specified by\n\n.L2::secondary_priority is applied.\n\nIf the memory address does not fall in either of the above ranges, then the applied eviction\n\npriority is unspecified.\n\nThe 32-bit operand primary-size specifies the size, in bytes, of the primary range. The\n\n32-bit operand total-size specifies the combined size, in bytes, of the address range\n\nincluding primary and secondary ranges. The value of primary-size must be less than or equal\n\nto the value of total-size. Maximum allowed value of total-size is 4GB.\n\nIf .L2::secondary_priority is not specified, then it defaults to .L2::evict_unchanged.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the specified address does not fall within the address window of .global state space\n\nthen the behavior is undefined.\n\nFraction-based policy\n\nA memory operation with .level::cache_hint qualifier can use the fraction-based cache\n\neviction policy to request the cache eviction priority specified by .L2:primary_priority to\n\nbe applied to a fraction of cache accesses specified by the 32-bit floating point operand\n\nfraction. The remainder of the cache accesses get the eviction priority specified by\n\n.L2::secondary_priority. This implies that in a memory operation that uses a fraction-based\n\ncache policy, the memory access has a probability specified by the operand fraction of\n\ngetting the cache eviction priority specified by .L2::primary_priority.\n\nThe valid range of values for the operand fraction is (0.0,.., 1.0]. If the operand\n\nfraction is not specified, it defaults to 1.0.\n\nIf .L2::secondary_priority is not specified, then it defaults to .L2::evict_unchanged.\n\nThe access property created using the CUDA APIs can be converted into cache eviction policy by the\n\ninstruction createpolicy.cvt. The source operand access-property is a 64-bit opaque\n\nregister. Refer to CUDA programming guide for more details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.4.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nExamples\n\ncreatepolicy.fractional.L2::evict_last.b64                      policy, 1.0;\n\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64  polic ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy"
            };

        case "ctaid":
            // Documentation entry for the PTX special register `%ctaid` (looked up without the leading '%').
            //   html    - "For more information" link followed by the register's reference section as HTML.
            //   tooltip - plain-text rendering of the same section, ending with a trailing " ..." marker.
            //   url     - anchor of this register in NVIDIA's PTX ISA documentation.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-ctaid\" target=\"_blank\" rel=\"noopener noreferrer\">ctaid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %ctaid</h1><section id=\"special-registers-ctaid\">\n\n\n<p>CTA identifier within a grid.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %ctaid;                      // CTA id vector\n.sreg .u32 %ctaid.x, %ctaid.y, %ctaid.z;    // CTA id components\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the CTA identifier within the CTA\ngrid. The <code class=\"docutils literal notranslate\"><span class=\"pre\">%ctaid</span></code> special register contains a 1D, 2D, or 3D vector, depending on the shape and\nrank of the CTA grid. The fourth element is unused and always returns zero.</p>\n<p>It is guaranteed that:</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>0  &lt;=  %ctaid.x &lt;  %nctaid.x\n0  &lt;=  %ctaid.y &lt;  %nctaid.y\n0  &lt;=  %ctaid.z &lt;  %nctaid.z\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0 with type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u16</span></code>.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u32</span></code> in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instructions may be used to read the lower 16-bits of each component of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%ctaid</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32  %r0,%ctaid.x;\nmov.u16  %rh,%ctaid.y;   // legacy code\n</pre></div>\n</div>\n</section>",
                "tooltip": "CTA identifier within a grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %ctaid;                      // CTA id vector\n\n.sreg .u32 %ctaid.x, %ctaid.y, %ctaid.z;    // CTA id components\n\nDescription\n\nA predefined, read-only special register initialized with the CTA identifier within the CTA\n\ngrid. The %ctaid special register contains a 1D, 2D, or 3D vector, depending on the shape and\n\nrank of the CTA grid. The fourth element is unused and always returns zero.\n\nIt is guaranteed that:\n\n0  <=  %ctaid.x <  %nctaid.x\n\n0  <=  %ctaid.y <  %nctaid.y\n\n0  <=  %ctaid.z <  %nctaid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%ctaid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32  %r0,%ctaid.x;\n\nmov.u16  %rh,%ctaid.y;   // legacy code\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-ctaid"
            };

        case "current_graph_exec":
            // Documentation entry for the PTX special register `%current_graph_exec` (looked up without the leading '%').
            //   html    - "For more information" link followed by the register's reference section as HTML.
            //   tooltip - plain-text rendering of the same section, ending with a trailing " ..." marker.
            //   url     - anchor of this register in NVIDIA's PTX ISA documentation.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-current-graph-exec\" target=\"_blank\" rel=\"noopener noreferrer\">current_graph_exec <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %current_graph_exec</h1><section id=\"special-registers-current-graph-exec\">\n\n\n<p>An Identifier for currently executing CUDA device graph.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u64 %current_graph_exec;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the identifier referring to the CUDA\ndevice graph being currently executed. This register is 0 if the executing kernel is not part of a\nCUDA device graph.</p>\n<p>Refer to the <em>CUDA Programming Guide</em> for more details on CUDA device graphs.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u64  r1, %current_graph_exec;\n</pre></div>\n</div>\n</section>",
                "tooltip": "An Identifier for currently executing CUDA device graph.\n\nSyntax (predefined)\n\n.sreg .u64 %current_graph_exec;\n\nDescription\n\nA predefined, read-only special register initialized with the identifier referring to the CUDA\n\ndevice graph being currently executed. This register is 0 if the executing kernel is not part of a\n\nCUDA device graph.\n\nRefer to the CUDA Programming Guide for more details on CUDA device graphs.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nRequires sm_50 or higher.\n\nExamples\n\nmov.u64  r1, %current_graph_exec;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-current-graph-exec"
            };

        case "cvt":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt\" target=\"_blank\" rel=\"noopener noreferrer\">cvt <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack\" target=\"_blank\" rel=\"noopener noreferrer\">cvt.pack <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: cvt</h1><section id=\"data-movement-and-conversion-instructions-cvt\">\n\n\n<p>Convert a value from one type to another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cvt{.irnd}{.ftz}{.sat}.dtype.atype         d, a;  // integer rounding\ncvt{.frnd}{.ftz}{.sat}.dtype.atype         d, a;  // fp rounding\ncvt.frnd2{.relu}{.satfinite}.f16.f32       d, a;\ncvt.frnd2{.relu}{.satfinite}.f16x2.f32     d, a, b;\ncvt.frnd2{.relu}{.satfinite}.bf16.f32      d, a;\ncvt.frnd2{.relu}{.satfinite}.bf16x2.f32    d, a, b;\ncvt.rna{.satfinite}.tf32.f32               d, a;\ncvt.frnd2{.relu}.tf32.f32                  d, a;\ncvt.rn.satfinite{.relu}.f8x2type.f32       d, a, b;\ncvt.rn.satfinite{.relu}.f8x2type.f16x2     d, a;\ncvt.rn.{.relu}.f16x2.f8x2type              d, a;\n\n.irnd   = { .rni, .rzi, .rmi, .rpi };\n.frnd   = { .rn,  .rz,  .rm,  .rp  };\n.frnd2  = { .rn,  .rz };\n.dtype = .atype = { .u8,   .u16, .u32, .u64,\n                    .s8,   .s16, .s32, .s64,\n                    .bf16, .f16, .f32, .f64 };\n.f8x2type = { .e4m3x2, .e5m2x2 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Convert between different types and sizes.</p>\n<p>For <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, two inputs <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> of <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> type are\nconverted into <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> type and the converted values are packed in the destination\nregister <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, such that the value converted from input <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in the upper half of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>\nand the value converted from input <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is stored in the lower half of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code></p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction type, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type,\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code> instruction type, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p>When converting to <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> data formats, the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code>\ntype. When converting two <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> inputs to <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code>, each input is converted to the\nspecified format, and the converted values are packed in the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> such that the\nvalue converted from input <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in the upper 8 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and the value converted from\ninput <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is stored in the lower 8 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. 
When converting an <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> input to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/ <code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code>, each <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> input from operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is converted to the specified\nformat. The converted values are packed in the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> such that the value\nconverted from the upper 16 bits of input <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in the upper 8 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and the value\nconverted from the lower 16 bits of input <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in the lower 8 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>When converting from <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code>\ntype. Each 8-bit input value in operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is converted to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> type. 
The converted values\nare packed in the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> such that the value converted from the upper 8 bits of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in the upper 16 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and the value converted from the lower 8 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>\nis stored in the lower 16 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Rounding modifier is mandatory in all of the following cases:</p>\n<ul class=\"simple\">\n<li><p>float-to-float conversions, when destination type is smaller than source type</p></li>\n<li><p>All float-to-int conversions</p></li>\n<li><p>All int-to-float conversions</p></li>\n<li><p>All conversions involving <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2,</span> <span class=\"pre\">.e5m2x2,</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code> instruction\ntypes.</p></li>\n</ul>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.satfinite</span></code> modifier is only supported for conversions involving the following types:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> destination types. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">.satfinite</span></code> modifier is mandatory for such\nconversions.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> as destination types.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code> as destination type with rounding mode specified as round to nearest, ties away from\nzero.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (/* inst type is .f16x2 or .bf16x2 */) {\n    d[31:16] = convert(a);\n    d[15:0]  = convert(b);\n} else {\n    d = convert(a);\n}\n</pre></div>\n</div>\n<p><strong>Integer Notes</strong></p>\n<p>Integer rounding is required for float-to-integer conversions, and for same-size float-to-float\nconversions where the value is rounded to an integer. 
Integer rounding is illegal in all other\ninstances.</p>\n<p>Integer rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rni</span></code></dt>\n<dd>\n<p>round to nearest integer, choosing even integer if source is equidistant between two integers</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rzi</span></code></dt>\n<dd>\n<p>round to nearest integer in the direction of zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rmi</span></code></dt>\n<dd>\n<p>round to nearest integer in direction of negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rpi</span></code></dt>\n<dd>\n<p>round to nearest integer in direction of positive infinity</p>\n</dd>\n</dl>\n<p>In float-to-integer conversion, <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> inputs are converted to 0.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.ftz.dtype.f32</span></code> float-to-integer conversions and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.ftz.f32.f32</span></code> float-to-float\nconversions with integer rounding, subnormal inputs are flushed to sign-preserving zero. 
Modifier\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> can only be specified when either <code class=\"docutils literal notranslate\"><span class=\"pre\">.dtype</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.atype</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> and applies only\nto single precision (<code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>) inputs and results.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.ftz.dtype.f32</span></code> float-to-integer conversions and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.ftz.f32.f32</span></code>\nfloat-to-float conversions with integer rounding, subnormal inputs are flushed to sign-preserving\nzero. The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> modifier may be specified in these cases for clarity.</p>\n<p><strong>Note:</strong> In PTX ISA versions 1.4 and earlier, the <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instruction did not flush single-precision\nsubnormal inputs or results to zero if the destination type size was 64-bits. The compiler will\npreserve this behavior for legacy PTX code.</p>\n</dd>\n</dl>\n<p>Saturation modifier:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code></dt>\n<dd>\n<p>For integer destination types, <code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code> limits the result to <code class=\"docutils literal notranslate\"><span class=\"pre\">MININT..MAXINT</span></code> for the size of\nthe operation. 
Note that saturation applies to both signed and unsigned integer types.</p>\n<p>The saturation modifier is allowed only in cases where the destination type\u2019s value range is not\na superset of the source type\u2019s value range; i.e., the <code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code> modifier is illegal in cases\nwhere saturation is not possible based on the source and destination types.</p>\n<p>For float-to-integer conversions, the result is clamped to the destination range by default; i.e,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code> is redundant.</p>\n</dd>\n</dl>\n<p><strong>Floating Point Notes</strong></p>\n<p>Floating-point rounding is required for float-to-float conversions that result in loss of precision,\nand for integer-to-float conversions. Floating-point rounding is illegal in all other instances.</p>\n<p>Floating-point rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt>\n<dd>\n<p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rna</span></code></dt>\n<dd>\n<p>mantissa LSB rounds to nearest, ties away from zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>A floating-point value may be rounded to an integral value using the integer rounding modifiers (see\nInteger Notes). The operands must be of the same size. 
The result is an integral value, stored in\nfloating-point format.</p>\n<p>Subnormal numbers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported. Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> may be specified to flush\nsingle-precision subnormal inputs and results to sign-preserving zero. Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> can only\nbe specified when either <code class=\"docutils literal notranslate\"><span class=\"pre\">.dtype</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.atype</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> and applies only to single\nprecision (<code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>) inputs and results.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p>Single-precision subnormal inputs and results are flushed to sign-preserving zero. The optional\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> modifier may be specified in these cases for clarity.</p>\n</dd>\n</dl>\n<p><strong>Note:</strong> In PTX ISA versions 1.4 and earlier, the <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instruction did not flush\nsingle-precision subnormal inputs or results to zero if either source or destination type was\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code>. The compiler will preserve this behavior for legacy PTX code. 
Specifically, if the PTX\nISA version is 1.4 or earlier, single-precision subnormal inputs and results are flushed to\nsign-preserving zero only for <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.f32.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.f16.f32</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.f32.f32</span></code> instructions.</p>\n<p>Saturation modifier:</p>\n<dl class=\"simple\">\n<dt>\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code>:</dt>\n<dd>\n<p>For floating-point destination types, <code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code> limits the result to the range [0.0, 1.0]. <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>\nresults are flushed to positive zero. Applies to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> types.</p>\n</dd>\n<dt>\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.relu</span></code>:</dt>\n<dd>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code>\ndestination types, <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.relu</span></code> clamps the result to 0 if negative. <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are converted to\ncanonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n</dd>\n<dt>\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.satfinite</span></code>:</dt>\n<dd>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code>\ndestination formats, if the input value is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, then the result is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> in the specified\ndestination format. If the absolute value of input (ignoring sign) is greater than <em>MAX_NORM</em> of\nthe specified destination format, then the result is sign-preserved <em>MAX_NORM</em> of the destination\nformat.</p>\n</dd>\n</dl>\n<p><strong>Notes</strong></p>\n<p>A source register wider than the specified type may be used, except when the source operand has\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> format. The lower <code class=\"docutils literal notranslate\"><span class=\"pre\">n</span></code> bits corresponding to the instruction-type width\nare used in the conversion. 
See <a class=\"reference external\" href=\"#operand-size-exceeding-instruction-type-size\">Operand Size Exceeding Instruction-Type Size</a> for a description of these relaxed\ntype-checking rules.</p>\n<p>A destination register wider than the specified type may be used, except when the destination\noperand has <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code> format. The result of conversion is sign-extended to\nthe destination register width for signed integers, and is zero-extended to the destination register\nwidth for unsigned, bit-size, and floating-point types. See <a class=\"reference external\" href=\"#operand-size-exceeding-instruction-type-size\">Operand Size Exceeding Instruction-Type\nSize</a> for a description of these relaxed\ntype-checking rules.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.f32.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> input yields unspecified <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.relu</span></code> modifier and {<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code>} destination formats\nintroduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">cvt.bf16.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64/bf16}</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64}.bf16</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.tf32.f32.{relu}.{rn/rz}</span></code> introduced\nin PTX ISA 7.8.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher introduced in PTX ISA version 7.8.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.satfinite.{e4m3x2,</span> <span class=\"pre\">e5m2x2}.{f32,</span> <span class=\"pre\">f16x2}</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher introduced in PTX ISA version 7.8.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_89</span></code> introduced in PTX ISA version 8.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.satfinite.{e4m3x2,</span> <span class=\"pre\">e5m2x2}.{f32,</span> <span class=\"pre\">f16x2}</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_89</span></code> introduced in PTX ISA version 8.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.satfinite.{f16,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16x2,</span> <span 
class=\"pre\">tf32}.f32</span></code> introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> to or from <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.relu</span></code> modifier and {<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code>} destination formats require\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.bf16.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64/bf16}</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64}.bf16</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.tf32.f32.{relu}.{rn/rz}</span></code> require\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm89</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.satfinite.{e4m3x2,</span> <span class=\"pre\">e5m2x2}.{f32,</span> <span 
class=\"pre\">f16x2}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_89</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cvt.f32.s32 f,i;\ncvt.s32.f64 j,r;     // float-to-int saturates by default\ncvt.rni.f32.f32 x,y; // round to nearest int, result is fp\ncvt.f32.f32 x,y;     // note .ftz behavior for sm_1x targets\ncvt.rn.relu.f16.f32      b, f;        // result is saturated with .relu saturation mode\ncvt.rz.f16x2.f32         b1, f, f1;   // convert two fp32 values to packed fp16 outputs\ncvt.rn.relu.satfinite.f16x2.f32    b1, f, f1;   // convert two fp32 values to packed fp16 outputs with .relu saturation on each output\ncvt.rn.bf16.f32          b, f;        // convert fp32 to bf16\ncvt.rz.relu.satfinite.bf16.f3 2    b, f;        // convert fp32 to bf16 with .relu and .satfinite saturation\ncvt.rz.satfinite.bf16x2.f32        b1, f, f1;   // convert two fp32 values to packed bf16 outputs\ncvt.rn.relu.bf16x2.f32   b1, f, f1;   // convert two fp32 values to packed bf16 outputs with .relu saturation on each output\ncvt.rna.satfinite.tf32.f32         b1, f;       // convert fp32 to tf32 format\ncvt.rn.relu.tf32.f32     d, a;        // convert fp32 to tf32 format\ncvt.f64.bf16.rp          f, b;        // convert bf16 to f64 format\ncvt.bf16.f16.rz          b, f         // convert f16 to bf16 format\ncvt.bf16.u64.rz          b, u         // convert u64 to bf16 format\ncvt.s8.bf16.rpi          s, b         // convert bf16 to s8 format\ncvt.bf16.bf16.rpi        b1, b2       // convert bf16 to corresponding int represented in bf16 format\ncvt.rn.satfinite.e4m3x2.f32 d, a, b;  // convert a, b to .e4m3 and pack as .e4m3x2 output\ncvt.rn.relu.satfinite.e5m2x2.f16x2 d, a; // unpack a and convert the values to .e5m2 outputs with .relu\n                                         // saturation on each output and pack as 
.e5m2x2\ncvt.rn.f16x2.e4m3x2 d, a;             // unpack a, convert two .e4m3 values to packed f16x2 output\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cvt.pack</h1><section id=\"data-movement-and-conversion-instructions-cvt-pack\">\n\n\n<p>Convert two integer values from one integer type to another and pack the results.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cvt.pack.sat.convertType.abType  d, a, b;\n    .convertType  = { .u16, .s16 }\n    .abType       = { .s32 }\n\ncvt.pack.sat.convertType.abType.cType  d, a, b, c;\n    .convertType  = { .u2, .s2, .u4, .s4, .u8, .s8 }\n    .abType       = { .s32 }\n    .cType        = { .b32 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Convert two 32-bit integers <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> into specified type and pack the results into <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is an unsigned 32-bit integer. 
Source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are integers of\ntype <code class=\"docutils literal notranslate\"><span class=\"pre\">.abType</span></code> and the source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is an integer of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.cType</span></code>.</p>\n<p>The inputs <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are converted to values of type specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.convertType</span></code> with\nsaturation and the results after conversion are packed into lower bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>If operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is specified then remaining bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> are copied from lower bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ta = a &lt; MIN(convertType) ? MIN(convertType) : a;\nta = a &gt; MAX(convertType) ? MAX(convertType) : a;\ntb = b &lt; MIN(convertType) ? MIN(convertType) : b;\ntb = b &gt; MAX(convertType) ? 
MAX(convertType) : b;\n\nsize = sizeInBits(convertType);\ntd = tb ;\nfor (i = size; i &lt;= 2 * size - 1; i++) {\n    td[i] = ta[i - size];\n}\n\nif (isU16(convertType) || isS16(convertType)) {\n    d = td;\n} else {\n    for (i = 0; i &lt; 2 * size; i++) {\n        d[i] = td[i];\n    }\n    for (i = 2 * size; i &lt;= 31; i++) {\n        d[i] = c[i - 2 * size];\n    }\n}\n</pre></div>\n</div>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code> modifier limits the converted values to <code class=\"docutils literal notranslate\"><span class=\"pre\">MIN(convertType)</span></code>..<code class=\"docutils literal notranslate\"><span class=\"pre\">MAX(convertedType)</span></code> (no\noverflow) if the corresponding inputs are not in the range of datatype specified as\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.convertType</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.5.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_72</span></code> or higher.</p>\n<p>Sub byte types (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u4</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.s4</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.u2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.s2</span></code>) requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cvt.pack.sat.s16.s32      %r1, %r2, %r3;           // 32-bit to 16-bit conversion\ncvt.pack.sat.u8.s32.b32   %r4, %r5, %r6, 0;        // 32-bit to 8-bit conversion\ncvt.pack.sat.u8.s32.b32   %r7, %r8, %r9, %r4;      // %r7 = { %r5, %r6, %r8, %r9 
}\ncvt.pack.sat.u4.s32.b32   %r10, %r12, %r13, %r14;  // 32-bit to 4-bit conversion\ncvt.pack.sat.s2.s32.b32   %r15, %r16, %r17, %r18;  // 32-bits to 2-bit conversion\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Data Movement and Conversion Instructions: cvt\n\n\n\nConvert a value from one type to another.\n\nSyntax\n\ncvt{.irnd}{.ftz}{.sat}.dtype.atype         d, a;  // integer rounding\n\ncvt{.frnd}{.ftz}{.sat}.dtype.atype         d, a;  // fp rounding\n\ncvt.frnd2{.relu}{.satfinite}.f16.f32       d, a;\n\ncvt.frnd2{.relu}{.satfinite}.f16x2.f32     d, a, b;\n\ncvt.frnd2{.relu}{.satfinite}.bf16.f32      d, a;\n\ncvt.frnd2{.relu}{.satfinite}.bf16x2.f32    d, a, b;\n\ncvt.rna{.satfi...\n\n=====Data Movement and Conversion Instructions: cvt.pack\n\n\n\nConvert two integer values from one integer type to another and pack the results.\n\nSyntax\n\ncvt.pack.sat.convertType.abType  d, a, b;\n\n    .convertType  = { .u16, .s16 }\n\n    .abType       = { .s32 }\n\ncvt.pack.sat.convertType.abType.cType  d, a, b, c;\n\n    .convertType  = { .u2, .s2, .u4, .s4, .u8, .s8 }\n\n    .abType       = { .s32 }\n\n    .cType        = { .b32 }\n\nDescription\n\nConvert two 32-bit integers a a... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt"
            };

        case "cvta":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta\" target=\"_blank\" rel=\"noopener noreferrer\">cvta <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: cvta</h1><section id=\"data-movement-and-conversion-instructions-cvta\">\n\n\n<p>Convert address from <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> (<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>), <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code>\nstate space to generic, or vice-versa. 
Take the generic address of a variable declared in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> (<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>),\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// convert const, global, local, or shared address to generic address\ncvta.space.size  p, a;        // source address in register a\ncvta.space.size  p, var;      // get generic address of var\ncvta.space.size  p, var+imm;  // generic address of var+offset\n\n// convert generic address to const, global, local, or shared address\ncvta.to.space.size  p, a;\n\n.space = { .const, .global, .local, .shared{::cta, ::cluster}, .param{::entry} };\n.size  = { .u32, .u64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Convert a <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a>\n(<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>), <code class=\"docutils literal notranslate\"><span class=\"pre\">global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code> address to a generic address, or vice-versa. The\nsource and destination addresses must be the same size. 
Use <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.u32.u64</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.u64.u32</span></code> to\ntruncate or zero-extend addresses.</p>\n<p>For variables declared in <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> (<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>), <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code>\nstate space, the generic address of the variable may be taken using <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta</span></code>. The source is either a\nregister or a variable defined in <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> (<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>), <code class=\"docutils literal notranslate\"><span class=\"pre\">global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code> memory\nwith an optional offset.</p>\n<p>When converting a generic address into a <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> (<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>), <code class=\"docutils literal notranslate\"><span class=\"pre\">global</span></code>, 
<code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code>\naddress, the resulting address is undefined in cases where the generic address does not fall within\nthe address window of the specified state space. A program may use <code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep</span></code> to guard against\nsuch incorrect behavior.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, the address must belong to the space specified by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifier, otherwise the behavior is undefined. If no sub-qualifier\nis specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, then <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> is assumed by default.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> is specified without any sub-qualifiers then it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.param::entry</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.const</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.to.const</span></code> introduced in PTX ISA version 3.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.param</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.to.param</span></code> introduced in PTX ISA version 
7.7.</p>\n<p><strong>Note:</strong> The current implementation does not allow generic pointers to <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code> space variables in\nprograms that contain pointers to constant buffers passed as kernel parameters.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifiers introduced in PTX ISA version 7.8.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::entry</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space introduced in PTX ISA version 8.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.param{::entry}</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.to.param{::entry}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cvta.const.u32   ptr,cvar;\ncvta.local.u32   ptr,lptr;\ncvta.shared::cta.u32  
p,As+4;\ncvta.shared::cluster.u32 ptr, As;\ncvta.to.global.u32  p,gptr;\ncvta.param.u64   ptr,pvar;\ncvta.to.param::entry.u64  epptr, ptr;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Convert address from .const, Kernel Function Parameters (.param), .global, .local, or .shared\n\nstate space to generic, or vice-versa. Take the generic address of a variable declared in\n\n.const, Kernel Function Parameters (.param),\n\n.global, .local, or .shared state space.\n\nSyntax\n\n// convert const, global, local, or shared address to generic address\n\ncvta.space.size  p, a;        // source address in register a\n\ncvta.space.size  p, var;      // get generic address of var\n\ncvta.space.size  p, var+imm;  // generic address of var+offset\n\n// convert generic address to const, global, local, or shared address\n\ncvta.to.space.size  p, a;\n\n.space = { .const, .global, .local, .shared{::cta, ::cluster}, .param{::entry} };\n\n.size  = { .u32, .u64 };\n\nDescription\n\nConvert a const, Kernel Function Parameters\n\n(.param), global, local, or shared address to a generic address, or vice-versa. The\n\nsource and destination addresses must be the same size. Use cvt.u32.u64 or cvt.u64.u32 to\n\ntruncate or zero-extend addresses.\n\nFor variables declared in .const, Kernel Function Parameters (.param), .global, .local, or .shared\n\nstate space, the generic address of the variable may be taken using cvta. The source is either a\n\nregister or a variable defined in const, Kernel Function Parameters (.param), global, local, or shared memory\n\nwith an optional offset.\n\nWhen converting a generic address into a const, Kernel Function Parameters (.param), global, local, or shared\n\naddress, the resulting address is undefined in cases where the generic address does not fall within\n\nthe address window of the specified state space. A program may use isspacep to guard against\n\nsuch incorrect behavior.\n\nFor cvta with .shared state space, the address must belong to the space specified by\n\n::cta or ::cluster sub-qualifier, otherwise the behavior is undefined. 
If no sub-qualifier\n\nis specified with .shared state space, then ::cta is assumed by default.\n\nIf .param is specified without any sub-qualifiers then it defaults to .param::entry.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\ncvta.const and cvta.to.const introduced in PTX ISA version 3.1.\n\ncvta.param and cvta.to.param introduced in PTX ISA version 7.7.\n\nNote: The current implementation does not allow generic pointers to const space variables in\n\nprograms that contain pointers to constant buffers passed as kernel parameters.\n\nSupport for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.\n\nSupport for sub-qualifier ::entry on .param space introduced in PTX ISA version 8.3.\n\nTarget ISA Notes\n\ncvta requires sm_20 or higher.\n\ncvta.param{::entry} and cvta.to.param{::entry} requires sm_70 or higher.\n\nSub-qualifier ::cta requires sm_30 or higher.\n\nSub-qualifier ::cluster requires sm_90 or higher.\n\nExamples\n\ncvta.const.u32   ptr,cvar;\n\ncvta.local.u32   ptr,lptr;\n\ncvta.shared::cta.u32  p,As+4;\n\ncvta.shared::cluster.u32 ptr, As;\n\ncvta.to.global.u32  p,gptr;\n\ncvta.param.u64   ptr,pvar;\n\ncvta.to.param::entry.u64  epptr, ptr;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta"
            };

        case "discard":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard\" target=\"_blank\" rel=\"noopener noreferrer\">discard <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: discard</h1><section id=\"data-movement-and-conversion-instructions-discard\">\n\n\n<p>Invalidate the data in cache at the specified address and cache level.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>discard{.global}.level  [a], size;\n\n.level = { .L2 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">discard</span></code> instruction invalidates the data at the address range <code class=\"docutils literal notranslate\"><span class=\"pre\">[a</span> <span class=\"pre\">..</span> <span class=\"pre\">a</span> <span class=\"pre\">+</span> <span class=\"pre\">(size</span> <span class=\"pre\">-</span> <span class=\"pre\">1)]</span></code> in\nthe cache level specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.level</span></code> qualifier without writing back the data in the cache to\nthe memory. 
Therefore after the discard operation, the data at the address range <code class=\"docutils literal notranslate\"><span class=\"pre\">[a</span> <span class=\"pre\">..</span> <span class=\"pre\">a+</span> <span class=\"pre\">(size</span> <span class=\"pre\">-</span>\n<span class=\"pre\">1)]</span></code> has undetermined value.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> is an integer constant that specifies the amount of data, in bytes, in the\ncache level specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.level</span></code> qualifier to be discarded. The only supported value for the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> operand is 128.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the specified address does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space\nthen the behavior is undefined.</p>\n<p>Supported addressing modes for address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> must be aligned to 128 bytes.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>discard.global.L2 [ptr], 128;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Invalidate the data in cache at the specified address and cache level.\n\nSyntax\n\ndiscard{.global}.level  [a], size;\n\n.level = { .L2 };\n\nDescription\n\nThe discard instruction invalidates the data at the address range [a .. a + (size - 1)] in\n\nthe cache level specified by the .level qualifier without writing back the data in the cache to\n\nthe memory. Therefore after the discard operation, the data at the address range [a .. a+ (size -\n\n1)] has undetermined value.\n\nThe operand size is an integer constant that specifies the amount of data, in bytes, in the\n\ncache level specified by the .level qualifier to be discarded. The only supported value for the\n\nsize operand is 128.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the specified address does not fall within the address window of .global state space\n\nthen the behavior is undefined.\n\nSupported addressing modes for address operand a are described in Addresses as Operands. a must be aligned to 128 bytes.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.4.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nExamples\n\ndiscard.global.L2 [ptr], 128;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard"
            };

        case "div":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div\" target=\"_blank\" rel=\"noopener noreferrer\">div(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div\" target=\"_blank\" rel=\"noopener noreferrer\">div(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: div</h1><section id=\"floating-point-instructions-div\">\n\n\n<p>Divide one value by another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>div.approx{.ftz}.f32  d, a, b;  // fast, approximate divide\ndiv.full{.ftz}.f32    d, a, b;  // full-range approximate divide\ndiv.rnd{.ftz}.f32     d, a, b;  // IEEE 754 compliant rounding\ndiv.rnd.f64           d, a, b;  // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Divides <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> by <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, stores result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a / b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><strong>Fast, approximate single-precision divides:</strong></p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.approx.f32</span></code> implements a fast approximation to divide, computed as <code class=\"docutils literal notranslate\"><span 
class=\"pre\">d</span> <span class=\"pre\">=</span> <span class=\"pre\">a</span> <span class=\"pre\">*</span> <span class=\"pre\">(1/b)</span></code>. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">|b|</span></code> in [2<sup>-126</sup>, 2<sup>126</sup>], the maximum <code class=\"docutils literal notranslate\"><span class=\"pre\">ulp</span></code> error is 2. For 2<sup>126</sup> &lt;\n<code class=\"docutils literal notranslate\"><span class=\"pre\">|b|</span></code> &lt; 2<sup>128</sup>, if <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is infinity, <code class=\"docutils literal notranslate\"><span class=\"pre\">div.approx.f32</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, otherwise it\nreturns 0.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.full.f32</span></code> implements a relatively fast, full-range approximation that scales operands to\nachieve better accuracy, but is not fully IEEE 754 compliant and does not support rounding\nmodifiers. The maximum <code class=\"docutils literal notranslate\"><span class=\"pre\">ulp</span></code> error is 2 across the full range of inputs.</p></li>\n<li><p>Subnormal inputs and results are flushed to sign-preserving zero. 
Fast, approximate division by\nzero creates a value of infinity (with same sign as <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>).</p></li>\n</ul>\n<p><strong>Divide with IEEE 754 compliant rounding:</strong></p>\n<p>Rounding modifiers (no default):</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt>\n<dd>\n<p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.f32</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">div.f64</span></code> introduced in PTX ISA version 1.0.</p>\n<p>Explicit modifiers <code class=\"docutils 
literal notranslate\"><span class=\"pre\">.approx</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.full</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code>, and rounding introduced in PTX ISA version 1.4.</p>\n<p>For PTX ISA version 1.4 and later, one of <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.full</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code> is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">div.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">div.approx.ftz.f32</span></code>, and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">div.f64</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">div.rn.f64</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.approx.f32</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">div.full.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.rnd.f32</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.rn.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span> <span class=\"pre\">map_f64_to_f32</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.{rz,rm,rp}.f64</span></code> requires <code 
class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>div.approx.ftz.f32  diam,circum,3.14159;\ndiv.full.ftz.f32    x, y, z;\ndiv.rn.f64          xd, yd, zd;\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: div</h1><section id=\"integer-arithmetic-instructions-div\">\n\n\n<p>Divide one value by another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>div.type  d, a, b;\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Divides <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> by <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, stores result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a / b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Division by zero yields an unspecified, machine-specific value.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>div.s32  b,n,i;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: div\n\n\n\nDivide one value by another.\n\nSyntax\n\ndiv.approx{.ftz}.f32  d, a, b;  // fast, approximate divide\n\ndiv.full{.ftz}.f32    d, a, b;  // full-range approximate divide\n\ndiv.rnd{.ftz}.f32     d, a, b;  // IEEE 754 compliant rounding\n\ndiv.rnd.f64           d, a, b;  // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nDivides a by b, stores result in d.\n\nSemantics\n\nd = a / b;\n\nNotes\n\nFast, a...\n\n=====Integer Arithmetic Instructions: div\n\n\n\nDivide one value by another.\n\nSyntax\n\ndiv.type  d, a, b;\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64 };\n\nDescription\n\nDivides a by b, stores result in d.\n\nSemantics\n\nd = a / b;\n\nNotes\n\nDivision by zero yields an unspecified, machine-specific value.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\ndiv.s32  b,n,i;\n\n... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div"
            };

        case "dp2a":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a\" target=\"_blank\" rel=\"noopener noreferrer\">dp2a(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: dp2a</h1><section id=\"integer-arithmetic-instructions-dp2a\">\n\n\n<p>Two-way dot product-accumulate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>dp2a.mode.atype.btype  d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n.mode = { .lo, .hi };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit result.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are 32-bit inputs. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> holds two 16-bits inputs in packed form and\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> holds 4 byte inputs in packed form for dot product.</p>\n<p>Depending on the <code class=\"docutils literal notranslate\"><span class=\"pre\">.mode</span></code> specified, either lower half or upper half of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> will be used\nfor dot product.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> if both <code class=\"docutils literal notranslate\"><span class=\"pre\">.atype</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.btype</span></code> are <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> else operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\nhas type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = c;\n// Extract two 16-bit values from a 32-bit input and sign or zero extend\n// based on input type.\nVa = extractAndSignOrZeroExt_2(a, .atype);\n\n// Extract four 8-bit values from a 32-bit input and sign or zer extend\n// based on input type.\nVb = extractAndSignOrZeroExt_4(b, .btype);\n\nb_select = (.mode == .lo) ? 
0 : 2;\n\nfor (i = 0; i &lt; 2; ++i) {\n    d += Va[i] * Vb[b_select + i];\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 5.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_61</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>dp2a.lo.u32.u32           d0, a0, b0, c0;\ndp2a.hi.u32.s32           d1, a1, b1, c1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Two-way dot product-accumulate.\n\nSyntax\n\ndp2a.mode.atype.btype  d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n\n.mode = { .lo, .hi };\n\nDescription\n\nTwo-way 16-bit to 8-bit dot product which is accumulated in 32-bit result.\n\nOperand a and b are 32-bit inputs. Operand a holds two 16-bits inputs in packed form and\n\noperand b holds 4 byte inputs in packed form for dot product.\n\nDepending on the .mode specified, either lower half or upper half of operand b will be used\n\nfor dot product.\n\nOperand c has type .u32 if both .atype and .btype are .u32 else operand c\n\nhas type .s32.\n\nSemantics\n\nd = c;\n\n// Extract two 16-bit values from a 32-bit input and sign or zero extend\n\n// based on input type.\n\nVa = extractAndSignOrZeroExt_2(a, .atype);\n\n// Extract four 8-bit values from a 32-bit input and sign or zer extend\n\n// based on input type.\n\nVb = extractAndSignOrZeroExt_4(b, .btype);\n\nb_select = (.mode == .lo) ? 0 : 2;\n\nfor (i = 0; i < 2; ++i) {\n\n    d += Va[i] * Vb[b_select + i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\nRequires sm_61 or higher.\n\nExamples\n\ndp2a.lo.u32.u32           d0, a0, b0, c0;\n\ndp2a.hi.u32.s32           d1, a1, b1, c1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a"
            };

        case "dp4a":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a\" target=\"_blank\" rel=\"noopener noreferrer\">dp4a(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: dp4a</h1><section id=\"integer-arithmetic-instructions-dp4a\">\n\n\n<p>Four-way byte dot product-accumulate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>dp4a.atype.btype  d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Four-way byte dot product which is accumulated in 32-bit result.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are 32-bit inputs which hold 4 byte inputs in packed form for dot product.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> if both <code class=\"docutils literal notranslate\"><span class=\"pre\">.atype</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.btype</span></code> are <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> else operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\nhas type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = c;\n\n// Extract 4 bytes from a 32bit input and sign or zero extend\n// based on input type.\nVa = extractAndSignOrZeroExt_4(a, .atype);\nVb = 
extractAndSignOrZeroExt_4(b, .btype);\n\nfor (i = 0; i &lt; 4; ++i) {\n    d += Va[i] * Vb[i];\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 5.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_61</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>dp4a.u32.u32           d0, a0, b0, c0;\ndp4a.u32.s32           d1, a1, b1, c1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Four-way byte dot product-accumulate.\n\nSyntax\n\ndp4a.atype.btype  d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n\nDescription\n\nFour-way byte dot product which is accumulated in 32-bit result.\n\nOperand a and b are 32-bit inputs which hold 4 byte inputs in packed form for dot product.\n\nOperand c has type .u32 if both .atype and .btype are .u32 else operand c\n\nhas type .s32.\n\nSemantics\n\nd = c;\n\n// Extract 4 bytes from a 32bit input and sign or zero extend\n\n// based on input type.\n\nVa = extractAndSignOrZeroExt_4(a, .atype);\n\nVb = extractAndSignOrZeroExt_4(b, .btype);\n\nfor (i = 0; i < 4; ++i) {\n\n    d += Va[i] * Vb[i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\nRequires sm_61 or higher.\n\nExamples\n\ndp4a.u32.u32           d0, a0, b0, c0;\n\ndp4a.u32.s32           d1, a1, b1, c1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a"
            };

        case "dynamic_smem_size":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-dynamic-smem-size\" target=\"_blank\" rel=\"noopener noreferrer\">dynamic_smem_size <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %dynamic_smem_size</h1><section id=\"special-registers-dynamic-smem-size\">\n\n\n<p>Size of shared memory allocated dynamically at kernel launch.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %dynamic_smem_size;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Size of shared memory allocated dynamically at kernel launch.</p>\n<p>A predefined, read-only special register initialized with size of shared memory allocated dynamically for the CTA of a kernel at launch time.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32  %r, %dynamic_smem_size;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Size of shared memory allocated dynamically at kernel launch.\n\nSyntax (predefined)\n\n.sreg .u32 %dynamic_smem_size;\n\nDescription\n\nSize of shared memory allocated dynamically at kernel launch.\n\nA predefined, read-only special register initialized with size of shared memory allocated dynamically for the CTA of a kernel at launch time.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\nmov.u32  %r, %dynamic_smem_size;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-dynamic-smem-size"
            };

        case "elect":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync\" target=\"_blank\" rel=\"noopener noreferrer\">elect.sync <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: elect.sync</h1><section id=\"parallel-synchronization-and-communication-instructions-elect-sync\">\n\n\n<p>Elect a leader thread from a set of threads.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>elect.sync d|p, membermask;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">elect.sync</span></code> elects one predicated active leader thread from among a set of threads specified by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">laneid</span></code> of the elected thread is returned in the 32-bit destination operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. The sink symbol \u2018_\u2019 can be used for destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. The predicate destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> for the leader thread, and <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code> for all other threads.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> specifies a 32-bit integer indicating the set of threads from which a leader\nis to be elected. 
The behavior is undefined if the executing thread is not in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<p>Election of a leader thread happens deterministically, i.e. the same leader thread is elected for\nthe same <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> every time.</p>\n<p>The mandatory <code class=\"docutils literal notranslate\"><span class=\"pre\">.sync</span></code> qualifier indicates that <code class=\"docutils literal notranslate\"><span class=\"pre\">elect</span></code> causes the executing thread to wait until\nall threads in the <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> execute the <code class=\"docutils literal notranslate\"><span class=\"pre\">elect</span></code> instruction before resuming execution.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>elect.sync    %r0|%p0, 0xffffffff;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Elect a leader thread from a set of threads.\n\nSyntax\n\nelect.sync d|p, membermask;\n\nDescription\n\nelect.sync elects one predicated active leader thread from among a set of threads specified by\n\nmembermask. laneid of the elected thread is returned in the 32-bit destination operand\n\nd. The sink symbol \u2018_\u2019 can be used for destination operand d. The predicate destination\n\np is set to True for the leader thread, and False for all other threads.\n\nOperand membermask specifies a 32-bit integer indicating the set of threads from which a leader\n\nis to be elected. The behavior is undefined if the executing thread is not in membermask.\n\nElection of a leader thread happens deterministically, i.e. the same leader thread is elected for\n\nthe same membermask every time.\n\nThe mandatory .sync qualifier indicates that elect causes the executing thread to wait until\n\nall threads in the membermask execute the elect instruction before resuming execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nelect.sync    %r0|%p0, 0xffffffff;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync"
            };

        case "entry":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-entry\" target=\"_blank\" rel=\"noopener noreferrer\">entry <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Kernel and Function Directives: .entry</h1><section id=\"kernel-and-function-directives-entry\">\n\n\n<p>Kernel entry point and body, with optional parameters.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.entry kernel-name ( param-list )  kernel-body\n.entry kernel-name  kernel-body\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Defines a kernel entry point name, parameters, and body for the kernel function.</p>\n<p>Parameters are passed via <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space memory and are listed within an optional parenthesized\nparameter list. Parameters may be referenced by name within the kernel body and loaded into\nregisters using <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param{::entry}</span></code> instructions.</p>\n<p>In addition to normal parameters, opaque <code class=\"docutils literal notranslate\"><span class=\"pre\">.texref</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.samplerref</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variables\nmay be passed as parameters. 
These parameters can only be referenced by name within texture and\nsurface load, store, and query instructions and cannot be accessed via <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param</span></code> instructions.</p>\n<p>The shape and size of the CTA executing the kernel are available in special registers.</p>\n<p><strong>Semantics</strong></p>\n<p>Specify the entry point for a kernel program.</p>\n<p>At kernel launch, the kernel dimensions and properties are established and made available via\nspecial registers, e.g., <code class=\"docutils literal notranslate\"><span class=\"pre\">%ntid</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">%nctaid</span></code>, etc.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>For PTX ISA version 1.4 and later, parameter variables are declared in the kernel parameter\nlist. For PTX ISA versions 1.0 through 1.3, parameter variables are declared in the kernel body.</p>\n<p>The maximum memory size supported by PTX for normal (non-opaque type) parameters is 32764\nbytes. Depending upon the PTX ISA version, the parameter size limit varies. 
The following table\nshows the allowed parameter size for a PTX ISA version:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 48%\"/>\n<col style=\"width: 52%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>PTX ISA Version</p></th>\n<th class=\"head\"><p>Maximum parameter size (In bytes)</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p>PTX ISA version 8.1 and above</p></td>\n<td><p>32764</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>PTX ISA version 1.5 and above</p></td>\n<td><p>4352</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>PTX ISA version 1.4 and above</p></td>\n<td><p>256</p></td>\n</tr>\n</tbody>\n</table>\n<p>The CUDA and OpenCL drivers support the following limits for parameter memory:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 13%\"/>\n<col style=\"width: 88%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Driver</p></th>\n<th class=\"head\"><p>Parameter memory size</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p>CUDA</p></td>\n<td><p>256 bytes for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code>, 4096 bytes for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_2x</span> <span class=\"pre\">and</span> <span class=\"pre\">higher</span></code>,\n32764 bytes fo <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> and higher</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>OpenCL</p></td>\n<td><p>32764 bytes for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> and higher, 4352 bytes on <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code>\nand lower</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div 
class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.entry cta_fft\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n{\n    .reg .b32 %r&lt;99&gt;;\n    ld.param.b32  %r1, [x];\n    ld.param.b32  %r2, [y];\n    ld.param.b32  %r3, [z];\n    ...\n}\n\n.entry prefix_sum ( .param .align 4 .s32 pitch[8000] )\n{\n    .reg .s32 %t;\n    ld.param::entry.s32  %t, [pitch];\n    ...\n}\n</pre></div>\n</div>\n</section>",
                "tooltip": "Kernel entry point and body, with optional parameters.\n\nSyntax\n\n.entry kernel-name ( param-list )  kernel-body\n\n.entry kernel-name  kernel-body\n\nDescription\n\nDefines a kernel entry point name, parameters, and body for the kernel function.\n\nParameters are passed via .param space memory and are listed within an optional parenthesized\n\nparameter list. Parameters may be referenced by name within the kernel body and loaded into\n\nregisters using ld.param{::entry} instructions.\n\nIn addition to normal parameters, opaque .texref, .samplerref, and .surfref variables\n\nmay be passed as parameters. These parameters can only be referenced by name within texture and\n\nsurface load, store, and query instructions and cannot be accessed via ld.param instructions.\n\nThe shape and size of the CTA executing the kernel are available in special registers.\n\nSemantics\n\nSpecify the entry point for a kernel program.\n\nAt kernel launch, the kernel dimensions and properties are established and made available via\n\nspecial registers, e.g., %ntid, %nctaid, etc.\n\nPTX ISA Notes\n\nFor PTX ISA version 1.4 and later, parameter variables are declared in the kernel parameter\n\nlist. For PTX ISA versions 1.0 through 1.3, parameter variables are declared in the kernel body.\n\nThe maximum memory size supported by PTX for normal (non-opaque type) parameters is 32764\n\nbytes. Depending upon the PTX ISA version, the parameter size limit varies. 
The following table\n\nshows the allowed parameter size for a PTX ISA version:\n\n\n\nPTX ISA Version\n\nMaximum parameter size (In bytes)\n\n\n\nPTX ISA version 8.1 and above\n\n32764\n\nPTX ISA version 1.5 and above\n\n4352\n\nPTX ISA version 1.4 and above\n\n256\n\nThe CUDA and OpenCL drivers support the following limits for parameter memory:\n\n\n\nDriver\n\nParameter memory size\n\n\n\nCUDA\n\n256 bytes for sm_1x, 4096 bytes for sm_2x and higher,\n\n32764 bytes fo sm_70 and higher\n\nOpenCL\n\n32764 bytes for sm_70 and higher, 4352 bytes on sm_6x\n\nand lower\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry cta_fft\n\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n\n{\n\n    .reg .b32 %r<99>;\n\n    ld.param.b32  %r1, [x];\n\n    ld.param.b32  %r2, [y];\n\n    ld.param.b32  %r3, [z];\n\n    ...\n\n}\n\n.entry prefix_sum ( .param .align 4 .s32 pitch[8000] )\n\n{\n\n    .reg .s32 %t;\n\n    ld.param::entry.s32  %t, [pitch];\n\n    ...\n\n}\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-entry"
            };

        case "envreg<32>":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-envreg-32\" target=\"_blank\" rel=\"noopener noreferrer\">envreg<32> <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %envreg<32></h1><section id=\"special-registers-envreg-32\">\n\n\n<p>Driver-defined read-only registers.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .b32 %envreg&lt;32&gt;;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A set of 32 pre-defined read-only registers used to capture execution environment of PTX program\noutside of PTX virtual machine. These registers are initialized by the driver prior to kernel launch\nand can contain cta-wide or grid-wide values.</p>\n<p>Precise semantics of these registers is defined in the driver documentation.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.b32      %r1,%envreg0;  // move envreg0 to %r1\n</pre></div>\n</div>\n</section>",
                "tooltip": "Driver-defined read-only registers.\n\nSyntax (predefined)\n\n.sreg .b32 %envreg<32>;\n\nDescription\n\nA set of 32 pre-defined read-only registers used to capture execution environment of PTX program\n\noutside of PTX virtual machine. These registers are initialized by the driver prior to kernel launch\n\nand can contain cta-wide or grid-wide values.\n\nPrecise semantics of these registers is defined in the driver documentation.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.b32      %r1,%envreg0;  // move envreg0 to %r1\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-envreg-32"
            };

        case "ex2":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2\" target=\"_blank\" rel=\"noopener noreferrer\">ex2(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2\" target=\"_blank\" rel=\"noopener noreferrer\">ex2(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: ex2</h1><section id=\"floating-point-instructions-ex2\">\n\n\n<p>Find the base-2 exponential of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ex2.approx{.ftz}.f32  d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Raise 2 to the power <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = 2 ^ a;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.f32</span></code> implements a fast approximation to 2<sup>a</sup>.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 60%\"/>\n<col style=\"width: 40%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p>-Inf</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>-subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>-0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr 
class=\"row-odd\">\n<td><p>+0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>+subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>+Inf</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The maximum absolute error is 2<sup>-22.5</sup> for fraction in the primary range.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p>Subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.f32</span></code> introduced in PTX ISA version 1.0. 
Explicit modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code>\nintroduced in PTX ISA version 1.4.</p>\n<p>For PTX ISA version 1.4 and later, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> modifier is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.ftz.f32</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ex2.approx.ftz.f32  xa, a;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: ex2</h1><section id=\"half-precision-floating-point-instructions-ex2\">\n\n\n<p>Find the base-2 exponent of input.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ex2.approx.atype     d, a;\nex2.approx.ftz.btype d, a;\n\n.atype = { .f16,  .f16x2}\n.btype = { .bf16, .bf16x2}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Raise 2 to the power <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>The type of operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> are as specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, each of the 
half-word operands are operated in\nparallel and the results are packed appropriately into a <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (.type == .f16 || .type == .bf16) {\n  d = 2 ^ a\n} else if (.type == .f16x2 || .type == .bf16x2) {\n  fA[0] = a[0:15];\n  fA[1] = a[16:31];\n  d[0] = 2 ^ fA[0]\n  d[1] = 2 ^ fA[1]\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.{f16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">bf16x2}</span></code> implement a fast approximation to 2<sup>a</sup>.</p>\n<p>For the <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> type, subnormal inputs are supported. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.ftz.bf16</span></code> flushes subnormal\ninputs and results to sign-preserving zero.</p>\n<p>Results of <code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.ftz.bf16</span></code> for various corner-case inputs are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 60%\"/>\n<col style=\"width: 40%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p>-Inf</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>-subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>-0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>+0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>+subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>+Inf</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>Results of <code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.f16</span></code> for various corner-case inputs are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 47%\"/>\n<col style=\"width: 53%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p>-Inf</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>-0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>+0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>+Inf</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr 
class=\"row-even\">\n<td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The maximum relative error for <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> type is 2-9.9. The maximum relative error for <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> type\nis 2-7.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.ftz.{bf16/bf16x2}</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.ftz.{bf16/bf16x2}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ex2.approx.f16         h1, h0;\nex2.approx.f16x2       hd1, hd0;\nex2.approx.ftz.bf16    b1, b2;\nex2.approx.ftz.bf16x2  hb1, hb2;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: ex2\n\n\n\nFind the base-2 exponential of a value.\n\nSyntax\n\nex2.approx{.ftz}.f32  d, a;\n\nDescription\n\nRaise 2 to the power a.\n\nSemantics\n\nd = 2 ^ a;\n\nNotes\n\nex2.approx.f32 implements a fast approximation to 2a.\n\n\n\nInput\n\nResult\n\n\n\n-Inf\n\n+0.0\n\n-subnormal\n\n+1.0\n\n-0.0\n\n+1.0\n\n+0.0\n\n+1.0\n\n+subnormal\n\n+1.0\n\n+Inf\n\n+Inf\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2-22.5 for fraction in the primary range.\n\nSubnormal numbers:\n\nsm_20+\n\nBy default, sub...\n\n=====Half Precision Floating Point Instructions: ex2\n\n\n\nFind the base-2 exponent of input.\n\nSyntax\n\nex2.approx.atype     d, a;\n\nex2.approx.ftz.btype d, a;\n\n.atype = { .f16,  .f16x2}\n\n.btype = { .bf16, .bf16x2}\n\nDescription\n\nRaise 2 to the power a.\n\nThe type of operands d and a are as specified by .type.\n\nFor .f16x2 or .bf16x2 instruction type, each of the half-word operands are operated in\n\nparallel and the results are packed appropriately into a .f16x2 or .bf16... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2"
            };

        case "exit":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit\" target=\"_blank\" rel=\"noopener noreferrer\">exit <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Instructions: exit</h1><section id=\"control-flow-instructions-exit\">\n\n\n<p>Terminate a thread.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>exit;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Ends execution of a thread.</p>\n<p>As threads exit, barriers waiting on all threads are checked to see if the exiting threads are the\nonly threads that have not yet made it to a barrier{.cta} for all threads in the CTA or to a\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster</span></code> for all threads in the cluster. If the exiting threads are holding up the\nbarrier, the barrier is released.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>    exit;\n@p  exit;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Terminate a thread.\n\nSyntax\n\nexit;\n\nDescription\n\nEnds execution of a thread.\n\nAs threads exit, barriers waiting on all threads are checked to see if the exiting threads are the\n\nonly threads that have not yet made it to a barrier{.cta} for all threads in the CTA or to a\n\nbarrier.cluster for all threads in the cluster. If the exiting threads are holding up the\n\nbarrier, the barrier is released.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n    exit;\n\n@p  exit;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit"
            };

        case "explicitcluster":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-explicitcluster\" target=\"_blank\" rel=\"noopener noreferrer\">explicitcluster <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Cluster Dimension Directives: .explicitcluster</h1><section id=\"cluster-dimension-directives-explicitcluster\">\n\n\n<p>Declare that Kernel must be launched with cluster dimensions explicitly specified.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.explicitcluster\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares that this Kernel should be launched with cluster dimension explicitly specified.</p>\n<p><strong>Semantics</strong></p>\n<p>Kernels with <code class=\"docutils literal notranslate\"><span class=\"pre\">.explicitcluster</span></code> directive must be launched with cluster dimension explicitly\nspecified (either at launch time or via <code class=\"docutils literal notranslate\"><span class=\"pre\">.reqnctapercluster</span></code>), otherwise program will fail with\nruntime error or kernel launch failure.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.entry foo .explicitcluster         { . . . }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare that Kernel must be launched with cluster dimensions explicitly specified.\n\nSyntax\n\n.explicitcluster\n\nDescription\n\nDeclares that this Kernel should be launched with cluster dimension explicitly specified.\n\nSemantics\n\nKernels with .explicitcluster directive must be launched with cluster dimension explicitly\n\nspecified (either at launch time or via .reqnctapercluster), otherwise program will fail with\n\nruntime error or kernel launch failure.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.entry foo .explicitcluster         { . . . }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-explicitcluster"
            };

        case "extern":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-extern\" target=\"_blank\" rel=\"noopener noreferrer\">extern <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Linking Directives: .extern</h1><section id=\"linking-directives-extern\">\n\n\n<p>External symbol declaration.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.extern identifier\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares identifier to be defined external to the current module. The module defining such\nidentifier must define it as <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.visible</span></code> only once in a single object file. Extern\ndeclaration of symbol may appear multiple times and references to that get resolved against the\nsingle definition of that symbol.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.extern .global .b32 foo;  // foo is defined in another module\n</pre></div>\n</div>\n</section>",
                "tooltip": "External symbol declaration.\n\nSyntax\n\n.extern identifier\n\nDescription\n\nDeclares identifier to be defined external to the current module. The module defining such\n\nidentifier must define it as .weak or .visible only once in a single object file. Extern\n\ndeclaration of symbol may appear multiple times and references to that get resolved against the\n\nsingle definition of that symbol.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.extern .global .b32 foo;  // foo is defined in another module\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-extern"
            };

        case "file":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-file\" target=\"_blank\" rel=\"noopener noreferrer\">file <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Debugging Directives: .file</h1><section id=\"debugging-directives-file\">\n\n\n<p>Source file name.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.file file_index \"filename\" {, timestamp, file_size}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Associates a source filename with an integer index. <code class=\"docutils literal notranslate\"><span class=\"pre\">.loc</span></code> directives reference source files by\nindex.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.file</span></code> directive allows optionally specifying an unsigned number representing time of last\nmodification and an unsigned integer representing size in bytes of source file. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">timestamp</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">file_size</span></code> value can be 0 to indicate this information is not available.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">timestamp</span></code> value is in format of C and C++ data type <code class=\"docutils literal notranslate\"><span class=\"pre\">time_t</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">file_size</span></code> is an unsigned 64-bit integer.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.file</span></code> directive is allowed only in the outermost scope, i.e., at the same level as kernel\nand device function declarations.</p>\n<p><strong>Semantics</strong></p>\n<p>If timestamp and file size are not specified, they default to 0.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>Timestamp and file size introduced in PTX ISA version 3.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.file 1 \"example.cu\"\n.file 2 \"kernel.cu\"\n.file 1 \u201ckernel.cu\u201d, 1339013327, 64118\n</pre></div>\n</div>\n</section>",
                "tooltip": "Source file name.\n\nSyntax\n\n.file file_index \"filename\" {, timestamp, file_size}\n\nDescription\n\nAssociates a source filename with an integer index. .loc directives reference source files by\n\nindex.\n\n.file directive allows optionally specifying an unsigned number representing time of last\n\nmodification and an unsigned integer representing size in bytes of source file. timestamp and\n\nfile_size value can be 0 to indicate this information is not available.\n\ntimestamp value is in format of C and C++ data type time_t.\n\nfile_size is an unsigned 64-bit integer.\n\nThe .file directive is allowed only in the outermost scope, i.e., at the same level as kernel\n\nand device function declarations.\n\nSemantics\n\nIf timestamp and file size are not specified, they default to 0.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTimestamp and file size introduced in PTX ISA version 3.2.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.file 1 \"example.cu\"\n\n.file 2 \"kernel.cu\"\n\n.file 1 \u201ckernel.cu\u201d, 1339013327, 64118\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-file"
            };

        case "fma":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma\" target=\"_blank\" rel=\"noopener noreferrer\">fma(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma\" target=\"_blank\" rel=\"noopener noreferrer\">fma(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: fma</h1><section id=\"floating-point-instructions-fma\">\n\n\n<p>Fused multiply-add.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>fma.rnd{.ftz}{.sat}.f32  d, a, b, c;\nfma.rnd.f64              d, a, b, c;\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs a fused multiply-add with no loss of precision in the intermediate product and addition.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a*b + c;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f32</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to infinite precision and then adds <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to\nthis product, again in infinite precision. 
The resulting value is then rounded to single precision\nusing the rounding mode specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to infinite precision and then adds <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to\nthis product, again in infinite precision. The resulting value is then rounded to double precision\nusing the rounding mode specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code> is the same as <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code>.</p>\n<p>Rounding modifiers (no default):</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt>\n<dd>\n<p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.ftz.f32</span></code> flushes subnormal 
inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f32</span></code> is unimplemented for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code> targets.</p>\n</dd>\n</dl>\n<p>Saturation:</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.sat.f32</span></code> clamps the result to [0.0, 1.0]. <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code> introduced in PTX ISA version 1.4.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f32</span></code> introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f32</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>    fma.rn.ftz.f32  w,x,y,z;\n@p  fma.rn.f64      d,a,b,c;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: fma</h1><section id=\"half-precision-floating-point-instructions-fma\">\n\n\n<p>Fused 
multiply-add</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>fma.rnd{.ftz}{.sat}.f16     d, a, b, c;\nfma.rnd{.ftz}{.sat}.f16x2   d, a, b, c;\nfma.rnd{.ftz}.relu.f16      d, a, b, c;\nfma.rnd{.ftz}.relu.f16x2    d, a, b, c;\nfma.rnd{.relu}.bf16         d, a, b, c;\nfma.rnd{.relu}.bf16x2       d, a, b, c;\nfma.rnd.oob.{relu}.type     d, a, b, c;\n\n.rnd = { .rn };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs a fused multiply-add with no loss of precision in the intermediate product and addition.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vectors by half word values from source\noperands. Half-word operands are then operated in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\nresult in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code>\ntype. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>\ntype. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = a * b + c;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    fC[0] = c[0:15];\n    fC[1] = c[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = fA[i] * fB[i] + fC[i];\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Rounding modifiers (default is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>):</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt>\n<dd>\n<p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt>Subnormal numbers:</dt>\n<dd>\n<p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">fma.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt>Saturation modifier:</dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.sat.{f16,</span> <span class=\"pre\">f16x2}</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">fma.relu.{f16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">bf16x2}</span></code> clamps the result to 0 if negative. <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> result is\nconverted to canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n</dd>\n<dt>Out Of Bounds modifier:</dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.oob.{f16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">bf16x2}</span></code> clamps the result to 0 if either of the operands\nis <code class=\"docutils literal notranslate\"><span class=\"pre\">OOB</span> <span class=\"pre\">NaN</span></code> (defined under <a class=\"reference external\" href=\"#tensors\">Tensors</a>) value. 
The test for the special <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> value\nand resultant forcing of the result to +0.0 is performed independently for each of the\ntwo SIMD operations.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.relu.{f16,</span> <span class=\"pre\">f16x2}</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fma{.relu}.{bf16,</span> <span class=\"pre\">bf16x2}</span></code> introduced in PTX ISA version 7.0.</p>\n<p>Support for modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.oob</span></code> introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.relu.{f16,</span> <span class=\"pre\">f16x2}</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fma{.relu}.{bf16,</span> <span class=\"pre\">bf16x2}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma{.oob}.{f16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">bf16x2}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// scalar f16 fused multiply-add\nfma.rn.f16         d0, a0, b0, c0;\nfma.rn.f16         d1, a1, b1, c1;\nfma.rn.relu.f16    d1, a1, b1, c1;\nfma.rn.oob.f16      d1, a1, b1, c1;\nfma.rn.oob.relu.f16 d1, a1, b1, c1;\n\n// scalar bf16 fused 
multiply-add\nfma.rn.bf16        d1, a1, b1, c1;\nfma.rn.relu.bf16   d1, a1, b1, c1;\nfma.rn.oob.bf16       d1, a1, b1, c1;\nfma.rn.oob.relu.bf16  d1, a1, b1, c1;\n\n// SIMD f16 fused multiply-add\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1}; // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3}; // pack two f16 to 32bit f16x2\nfma.rn.f16x2  p3, p1, p2, p2;   // SIMD f16x2 fused multiply-add\nfma.rn.relu.f16x2  p3, p1, p2, p2; // SIMD f16x2 fused multiply-add with relu saturation mode\nfma.rn.oob.f16x2  p3, p1, p2, p2; // SIMD f16x2 fused multiply-add with oob modifier\nfma.rn.oob.relu.f16x2 p3, p1, p2, p2; // SIMD f16x2 fused multiply-add with oob modifier and relu saturation mode\n\n// SIMD fp16 fused multiply-add\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nfma.rn.f16x2    f2, f0, f1, f1; // SIMD f16x2 fused multiply-add\n\n// SIMD bf16 fused multiply-add\nfma.rn.bf16x2       f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add\nfma.rn.relu.bf16x2  f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add with relu saturation mode\nfma.rn.oob.bf16x2  f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add with oob modifier\nfma.rn.oob.relu.bf16x2  f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add with oob modifier and relu saturation mode\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: fma\n\n\n\nFused multiply-add.\n\nSyntax\n\nfma.rnd{.ftz}{.sat}.f32  d, a, b, c;\n\nfma.rnd.f64              d, a, b, c;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nPerforms a fused multiply-add with no loss of precision in the intermediate product and addition.\n\nSemantics\n\nd = a*b + c;\n\nNotes\n\nfma.f32 computes the product of a and b to infinite precision and then adds c to\n\nthis product, again in infinite precision. The r...\n\n=====Half Precision Floating Point Instructions: fma\n\n\n\nFused multiply-add\n\nSyntax\n\nfma.rnd{.ftz}{.sat}.f16     d, a, b, c;\n\nfma.rnd{.ftz}{.sat}.f16x2   d, a, b, c;\n\nfma.rnd{.ftz}.relu.f16      d, a, b, c;\n\nfma.rnd{.ftz}.relu.f16x2    d, a, b, c;\n\nfma.rnd{.relu}.bf16         d, a, b, c;\n\nfma.rnd{.relu}.bf16x2       d, a, b, c;\n\nfma.rnd.oob.{relu}.type     d, a, b, c;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms a fused multiply-add with no loss of precision in the int... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma"
            };

        case "fns":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns\" target=\"_blank\" rel=\"noopener noreferrer\">fns(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: fns</h1><section id=\"integer-arithmetic-instructions-fns\">\n\n\n<p>Find the n-th set bit</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>fns.b32 d, mask, base, offset;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Given a 32-bit value <code class=\"docutils literal notranslate\"><span class=\"pre\">mask</span></code> and an integer value <code class=\"docutils literal notranslate\"><span class=\"pre\">base</span></code> (between 0 and 31), find the n-th (given\nby offset) set bit in <code class=\"docutils literal notranslate\"><span class=\"pre\">mask</span></code> from the <code class=\"docutils literal notranslate\"><span class=\"pre\">base</span></code> bit, and store the bit position in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. If not\nfound, store 0xffffffff in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">mask</span></code> has a 32-bit type. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">base</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>\ntype. Operand offset has <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> type. 
Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32.</span></code></p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">base</span></code> must be &lt;= 31, otherwise behavior is undefined.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = 0xffffffff;\nif (offset == 0) {\n    if (mask[base] == 1) {\n        d = base;\n    }\n} else {\n    pos = base;\n    count = |offset| - 1;\n    inc = (offset &gt; 0) ? 1 : -1;\n\n    while ((pos &gt;= 0) &amp;&amp; (pos &lt; 32)) {\n        if (mask[pos] == 1) {\n            if (count == 0) {\n              d = pos;\n              break;\n           } else {\n               count = count \u2013 1;\n           }\n        }\n        pos = pos + inc;\n    }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fns</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>fns.b32 d, 0xaaaaaaaa, 3, 1;   // d = 3\nfns.b32 d, 0xaaaaaaaa, 3, -1;  // d = 3\nfns.b32 d, 0xaaaaaaaa, 2, 1;   // d = 3\nfns.b32 d, 0xaaaaaaaa, 2, -1;  // d = 1\n</pre></div>\n</div>\n</section>",
                "tooltip": "Find the n-th set bit\n\nSyntax\n\nfns.b32 d, mask, base, offset;\n\nDescription\n\nGiven a 32-bit value mask and an integer value base (between 0 and 31), find the n-th (given\n\nby offset) set bit in mask from the base bit, and store the bit position in d. If not\n\nfound, store 0xffffffff in d.\n\nOperand mask has a 32-bit type. Operand base has .b32, .u32 or .s32\n\ntype. Operand offset has .s32 type. Destination d has type .b32.\n\nOperand base must be <= 31, otherwise behavior is undefined.\n\nSemantics\n\nd = 0xffffffff;\n\nif (offset == 0) {\n\n    if (mask[base] == 1) {\n\n        d = base;\n\n    }\n\n} else {\n\n    pos = base;\n\n    count = |offset| - 1;\n\n    inc = (offset > 0) ? 1 : -1;\n\n    while ((pos >= 0) && (pos < 32)) {\n\n        if (mask[pos] == 1) {\n\n            if (count == 0) {\n\n              d = pos;\n\n              break;\n\n           } else {\n\n               count = count \u2013 1;\n\n           }\n\n        }\n\n        pos = pos + inc;\n\n    }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nfns requires sm_30 or higher.\n\nExamples\n\nfns.b32 d, 0xaaaaaaaa, 3, 1;   // d = 3\n\nfns.b32 d, 0xaaaaaaaa, 3, -1;  // d = 3\n\nfns.b32 d, 0xaaaaaaaa, 2, 1;   // d = 3\n\nfns.b32 d, 0xaaaaaaaa, 2, -1;  // d = 1\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns"
            };

        case "func":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-func\" target=\"_blank\" rel=\"noopener noreferrer\">func <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Kernel and Function Directives: .func</h1><section id=\"kernel-and-function-directives-func\">\n\n\n<p>Function definition.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.func {.attribute(attr-list)} fname {.noreturn} function-body\n.func {.attribute(attr-list)} fname (param-list) {.noreturn} function-body\n.func {.attribute(attr-list)} (ret-param) fname (param-list) function-body\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Defines a function, including input and return parameters and optional function body.</p>\n<p>An optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive indicates that the function does not return to the caller\nfunction. <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive cannot be specified on functions which have return parameters. See\nthe description of <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive in <a class=\"reference external\" href=\"#performance-tuning-directives-noreturn\">Performance-Tuning Directives: .noreturn</a>.</p>\n<p>An optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.attribute</span></code> directive specifies additional information associated with the\nfunction. 
See the description of <a class=\"reference external\" href=\"#variable-and-function-attribute-directive-attribute\">Variable and Function Attribute Directive: .attribute</a> for allowed attributes.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">.func</span></code> definition with no body provides a function prototype.</p>\n<p>The parameter lists define locally-scoped variables in the function body. Parameters must be base\ntypes in either the register or parameter state space. Parameters in register state space may be\nreferenced directly within instructions in the function body. Parameters in <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space are\naccessed using <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param{::func}</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">st.param{::func}</span></code> instructions in the body. Parameter\npassing is call-by-value.</p>\n<p>The last parameter in the parameter list may be a <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> array of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b8</span></code> with no size\nspecified. It is used to pass an arbitrary number of parameters to the function packed into a single\narray object.</p>\n<p>When calling a function with such an unsized last argument, the last argument may be omitted from\nthe <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> instruction if no parameter is passed through it. Accesses to this array parameter must\nbe within the bounds of the array. 
The result of an access is undefined if no array was passed, or\nif the access was outside the bounds of the actual array being passed.</p>\n<p><strong>Semantics</strong></p>\n<p>The PTX syntax hides all details of the underlying calling convention and ABI.</p>\n<p>The implementation of parameter passing is left to the optimizing translator, which may use a\ncombination of registers and stack locations to pass parameters.</p>\n<p><strong>Release Notes</strong></p>\n<p>For PTX ISA version 1.x code, parameters must be in the register state space, there is no stack, and\nrecursion is illegal.</p>\n<p>PTX ISA versions 2.0 and later with target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher allow parameters in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>\nstate space, implements an ABI with stack, and supports recursion.</p>\n<p>PTX ISA versions 2.0 and later with target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher support at most one return value.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>Support for unsized array parameter introduced in PTX ISA version 6.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive introduced in PTX ISA version 6.4.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.attribute</span></code> directive introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Functions without unsized array parameter supported on all target architectures.</p>\n<p>Unsized array parameter requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive requires <code class=\"docutils literal 
notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.attribute</span></code> directive requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.func (.reg .b32 rval) foo (.reg .b32 N, .reg .f64 dbl)\n{\n.reg .b32 localVar;\n\n... use N, dbl;\nother code;\n\nmov.b32 rval,result;\nret;\n}\n\n...\ncall (fooval), foo, (val0, val1);  // return value in fooval\n...\n\n.func foo (.reg .b32 N, .reg .f64 dbl) .noreturn\n{\n.reg .b32 localVar;\n... use N, dbl;\nother code;\nmov.b32 rval, result;\nret;\n}\n...\ncall foo, (val0, val1);\n...\n\n.func (.param .u32 rval) bar(.param .u32 N, .param .align 4 .b8 numbers[])\n{\n    .reg .b32 input0, input1;\n    ld.param.b32   input0, [numbers + 0];\n    ld.param.b32   input1, [numbers + 4];\n    ...\n    other code;\n    ret;\n}\n...\n\n.param .u32 N;\n.param .align 4 .b8 numbers[8];\nst.param.u32    [N], 2;\nst.param.b32    [numbers + 0], 5;\nst.param.b32    [numbers + 4], 10;\ncall (rval), bar, (N, numbers);\n...\n</pre></div>\n</div>\n</section>",
                "tooltip": "Function definition.\n\nSyntax\n\n.func {.attribute(attr-list)} fname {.noreturn} function-body\n\n.func {.attribute(attr-list)} fname (param-list) {.noreturn} function-body\n\n.func {.attribute(attr-list)} (ret-param) fname (param-list) function-body\n\nDescription\n\nDefines a function, including input and return parameters and optional function body.\n\nAn optional .noreturn directive indicates that the function does not return to the caller\n\nfunction. .noreturn directive cannot be specified on functions which have return parameters. See\n\nthe description of .noreturn directive in Performance-Tuning Directives: .noreturn.\n\nAn optional .attribute directive specifies additional information associated with the\n\nfunction. See the description of Variable and Function Attribute Directive: .attribute for allowed attributes.\n\nA .func definition with no body provides a function prototype.\n\nThe parameter lists define locally-scoped variables in the function body. Parameters must be base\n\ntypes in either the register or parameter state space. Parameters in register state space may be\n\nreferenced directly within instructions in the function body. Parameters in .param space are\n\naccessed using ld.param{::func} and st.param{::func} instructions in the body. Parameter\n\npassing is call-by-value.\n\nThe last parameter in the parameter list may be a .param array of type .b8 with no size\n\nspecified. It is used to pass an arbitrary number of parameters to the function packed into a single\n\narray object.\n\nWhen calling a function with such an unsized last argument, the last argument may be omitted from\n\nthe call instruction if no parameter is passed through it. Accesses to this array parameter must\n\nbe within the bounds of the array. 
The result of an access is undefined if no array was passed, or\n\nif the access was outside the bounds of the actual array being passed.\n\nSemantics\n\nThe PTX syntax hides all details of the underlying calling convention and ABI.\n\nThe implementation of parameter passing is left to the optimizing translator, which may use a\n\ncombination of registers and stack locations to pass parameters.\n\nRelease Notes\n\nFor PTX ISA version 1.x code, parameters must be in the register state space, there is no stack, and\n\nrecursion is illegal.\n\nPTX ISA versions 2.0 and later with target sm_20 or higher allow parameters in the .param\n\nstate space, implements an ABI with stack, and supports recursion.\n\nPTX ISA versions 2.0 and later with target sm_20 or higher support at most one return value.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nSupport for unsized array parameter introduced in PTX ISA version 6.0.\n\nSupport for .noreturn directive introduced in PTX ISA version 6.4.\n\nSupport for .attribute directive introduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nFunctions without unsized array parameter supported on all target architectures.\n\nUnsized array parameter requires sm_30 or higher.\n\n.noreturn directive requires sm_30 or higher.\n\n.attribute directive requires sm_90 or higher.\n\nExamples\n\n.func (.reg .b32 rval) foo (.reg .b32 N, .reg .f64 dbl)\n\n{\n\n.reg .b32 localVar;\n\n... use N, dbl;\n\nother code;\n\nmov.b32 rval,result;\n\nret;\n\n}\n\n...\n\ncall (fooval), foo, (val0, val1);  // return value in fooval\n\n...\n\n.func foo (.reg .b32 N, .reg .f64 dbl) .noreturn\n\n{\n\n.reg .b32 localVar;\n\n... 
use N, dbl;\n\nother code;\n\nmov.b32 rval, result;\n\nret;\n\n}\n\n...\n\ncall foo, (val0, val1);\n\n...\n\n.func (.param .u32 rval) bar(.param .u32 N, .param .align 4 .b8 numbers[])\n\n{\n\n    .reg .b32 input0, input1;\n\n    ld.param.b32   input0, [numbers + 0];\n\n    ld.param.b32   input1, [numbers + 4];\n\n    ...\n\n    other code;\n\n    ret;\n\n}\n\n...\n\n.param .u32 N;\n\n.param .align 4 .b8 numbers[8];\n\nst.param.u32    [N], 2;\n\nst.param.b32    [numbers + 0], 5;\n\nst.param.b32    [numbers + 4], 10;\n\ncall (rval), bar, (N, numbers);\n\n...\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-func"
            };

        case "getctarank":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank\" target=\"_blank\" rel=\"noopener noreferrer\">getctarank <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: getctarank</h1><section id=\"data-movement-and-conversion-instructions-getctarank\">\n\n\n<p>Generate the CTA rank of the address.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>getctarank{.space}.type d, a;\n\n// Get cta rank from source shared memory address in register a.\ngetctarank.shared::cluster.type d, a;\n\n// Get cta rank from shared memory variable.\ngetctarank.shared::cluster.type d, var;\n\n// Get cta rank from shared memory variable+offset.\ngetctarank.shared::cluster.type d, var + imm;\n\n// Get cta rank from generic address of shared memory variable in register a.\ngetctarank.type d, a;\n\n.space = { .shared::cluster }\n.type  = { .u32, .u64 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Write the destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with the rank of the CTA which contains the address specified\nin operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Instruction type <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code> indicates the type of source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>When space is <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code>, source <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is either a shared memory variable or a register\ncontaining a valid shared memory 
address. When the optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.space</span></code> is not specified,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a register containing a generic address pointing to shared memory. Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is\nalways a 32-bit register which holds the rank of the CTA.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>getctarank.shared::cluster.u32 d1, addr;\ngetctarank.shared::cluster.u64 d2, sh + 4;\ngetctarank.u64                 d3, src;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Generate the CTA rank of the address.\n\nSyntax\n\ngetctarank{.space}.type d, a;\n\n// Get cta rank from source shared memory address in register a.\n\ngetctarank.shared::cluster.type d, a;\n\n// Get cta rank from shared memory variable.\n\ngetctarank.shared::cluster.type d, var;\n\n// Get cta rank from shared memory variable+offset.\n\ngetctarank.shared::cluster.type d, var + imm;\n\n// Get cta rank from generic address of shared memory variable in register a.\n\ngetctarank.type d, a;\n\n.space = { .shared::cluster }\n\n.type  = { .u32, .u64 }\n\nDescription\n\nWrite the destination register d with the rank of the CTA which contains the address specified\n\nin operand a.\n\nInstruction type .type indicates the type of source operand a.\n\nWhen space is .shared::cluster, source a is either a shared memory variable or a register\n\ncontaining a valid shared memory address. When the optional qualifier .space is not specified,\n\na is a register containing a generic address pointing to shared memory. Destination d is\n\nalways a 32-bit register which holds the rank of the CTA.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\ngetctarank.shared::cluster.u32 d1, addr;\n\ngetctarank.shared::cluster.u64 d2, sh + 4;\n\ngetctarank.u64                 d3, src;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank"
            };

        case "globaltimer":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi\" target=\"_blank\" rel=\"noopener noreferrer\">globaltimer <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %globaltimer, %globaltimer_lo, %globaltimer_hi</h1><section id=\"special-registers-globaltimer-globaltimer-lo-globaltimer-hi\">\n<span id=\"special-registers-globaltimer\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer</span></code></dt>\n<dd>\n<p>A predefined, 64-bit global nanosecond timer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_lo</span></code></dt>\n<dd>\n<p>The lower 32-bits of %globaltimer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_hi</span></code></dt>\n<dd>\n<p>The upper 32-bits of %globaltimer.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u64 %globaltimer;\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special registers intended for use by NVIDIA tools. The behavior is target-specific and may change\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\nunspecified.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u64  r1,%globaltimer;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%globaltimer\n\nA predefined, 64-bit global nanosecond timer.\n\n%globaltimer_lo\n\nThe lower 32-bits of %globaltimer.\n\n%globaltimer_hi\n\nThe upper 32-bits of %globaltimer.\n\nSyntax (predefined)\n\n.sreg .u64 %globaltimer;\n\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n\nDescription\n\nSpecial registers intended for use by NVIDIA tools. The behavior is target-specific and may change\n\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\n\nunspecified.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires target sm_30 or higher.\n\nExamples\n\nmov.u64  r1,%globaltimer;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi"
            };

        case "globaltimer_hi":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi\" target=\"_blank\" rel=\"noopener noreferrer\">globaltimer_hi <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %globaltimer, %globaltimer_lo, %globaltimer_hi</h1><section id=\"special-registers-globaltimer-globaltimer-lo-globaltimer-hi\">\n<span id=\"special-registers-globaltimer\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer</span></code></dt>\n<dd>\n<p>A predefined, 64-bit global nanosecond timer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_lo</span></code></dt>\n<dd>\n<p>The lower 32-bits of %globaltimer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_hi</span></code></dt>\n<dd>\n<p>The upper 32-bits of %globaltimer.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u64 %globaltimer;\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special registers intended for use by NVIDIA tools. The behavior is target-specific and may change\nor be removed in future GPUs. 
When JIT-compiled to other targets, the value of these registers is\nunspecified.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u64  r1,%globaltimer;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%globaltimer\n\nA predefined, 64-bit global nanosecond timer.\n\n%globaltimer_lo\n\nThe lower 32-bits of %globaltimer.\n\n%globaltimer_hi\n\nThe upper 32-bits of %globaltimer.\n\nSyntax (predefined)\n\n.sreg .u64 %globaltimer;\n\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n\nDescription\n\nSpecial registers intended for use by NVIDIA tools. The behavior is target-specific and may change\n\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\n\nunspecified.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires target sm_30 or higher.\n\nExamples\n\nmov.u64  r1,%globaltimer;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi"
            };

        case "globaltimer_lo":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi\" target=\"_blank\" rel=\"noopener noreferrer\">globaltimer_lo <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %globaltimer, %globaltimer_lo, %globaltimer_hi</h1><section id=\"special-registers-globaltimer-globaltimer-lo-globaltimer-hi\">\n<span id=\"special-registers-globaltimer\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer</span></code></dt>\n<dd>\n<p>A predefined, 64-bit global nanosecond timer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_lo</span></code></dt>\n<dd>\n<p>The lower 32-bits of %globaltimer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_hi</span></code></dt>\n<dd>\n<p>The upper 32-bits of %globaltimer.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u64 %globaltimer;\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special registers intended for use by NVIDIA tools. The behavior is target-specific and may change\nor be removed in future GPUs. 
When JIT-compiled to other targets, the value of these registers is\nunspecified.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u64  r1,%globaltimer;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%globaltimer\n\nA predefined, 64-bit global nanosecond timer.\n\n%globaltimer_lo\n\nThe lower 32-bits of %globaltimer.\n\n%globaltimer_hi\n\nThe upper 32-bits of %globaltimer.\n\nSyntax (predefined)\n\n.sreg .u64 %globaltimer;\n\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n\nDescription\n\nSpecial registers intended for use by NVIDIA tools. The behavior is target-specific and may change\n\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\n\nunspecified.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires target sm_30 or higher.\n\nExamples\n\nmov.u64  r1,%globaltimer;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi"
            };

        case "griddepcontrol":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol\" target=\"_blank\" rel=\"noopener noreferrer\">griddepcontrol <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: griddepcontrol</h1><section id=\"parallel-synchronization-and-communication-instructions-griddepcontrol\">\n\n\n<p>Control execution of dependent grids.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>griddepcontrol.action;\n\n.action   = { .launch_dependents, .wait }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">griddepcontrol</span></code> instruction allows the dependent grids and prerequisite grids as defined by\nthe runtime, to control execution in the following way:</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.launch_dependents</span></code> modifier signals that specific dependents the runtime system designated to\nreact to this instruction can be scheduled as soon as all other CTAs in the grid issue the same\ninstruction or have completed. The dependent may launch before the completion of the current\ngrid. There is no guarantee that the dependent will launch before the completion of the current\ngrid. 
Repeated invocations of this instruction by threads in the current CTA will have no additional\nside effects past that of the first invocation.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.wait</span></code> modifier causes the executing thread to wait until all prerequisite grids in flight have\ncompleted and all the memory operations from the prerequisite grids are performed and made visible\nto the current grid.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>If the prerequisite grid is using <code class=\"docutils literal notranslate\"><span class=\"pre\">griddepcontrol.launch_dependents</span></code>, then the dependent grid\nmust use <code class=\"docutils literal notranslate\"><span class=\"pre\">griddepcontrol.wait</span></code> to ensure correct functional execution.</p>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>griddepcontrol.launch_dependents;\ngriddepcontrol.wait;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Control execution of dependent grids.\n\nSyntax\n\ngriddepcontrol.action;\n\n.action   = { .launch_dependents, .wait }\n\nDescription\n\nThe griddepcontrol instruction allows the dependent grids and prerequisite grids as defined by\n\nthe runtime, to control execution in the following way:\n\n.launch_dependents modifier signals that specific dependents the runtime system designated to\n\nreact to this instruction can be scheduled as soon as all other CTAs in the grid issue the same\n\ninstruction or have completed. The dependent may launch before the completion of the current\n\ngrid. There is no guarantee that the dependent will launch before the completion of the current\n\ngrid. Repeated invocations of this instruction by threads in the current CTA will have no additional\n\nside effects past that of the first invocation.\n\n.wait modifier causes the executing thread to wait until all prerequisite grids in flight have\n\ncompleted and all the memory operations from the prerequisite grids are performed and made visible\n\nto the current grid.\n\nNote\n\nIf the prerequisite grid is using griddepcontrol.launch_dependents, then the dependent grid\n\nmust use griddepcontrol.wait to ensure correct functional execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\ngriddepcontrol.launch_dependents;\n\ngriddepcontrol.wait;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol"
            };

        case "gridid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-gridid\" target=\"_blank\" rel=\"noopener noreferrer\">gridid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %gridid</h1><section id=\"special-registers-gridid\">\n\n\n<p>Grid identifier.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u64 %gridid;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the per-grid temporal grid identifier. The\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%gridid</span></code> is used by debuggers to distinguish CTAs and clusters within concurrent (small) grids.</p>\n<p>During execution, repeated launches of programs may occur, where each launch starts a\ngrid-of-CTAs. This variable provides the temporal grid launch number for this context.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code> targets, <code class=\"docutils literal notranslate\"><span class=\"pre\">%gridid</span></code> is limited to the range [0..2<sup>16</sup>-1]. For <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%gridid</span></code> is limited to the range [0..2<sup>32</sup>-1]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> supports the entire 64-bit range.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0 as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16</span></code>.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> in PTX ISA version 1.3.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> in PTX ISA version 3.0.</p>\n<p>For compatibility with legacy PTX code, 16-bit and 32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instructions may be\nused to read the lower 16-bits or 32-bits of each component of <code class=\"docutils literal notranslate\"><span class=\"pre\">%gridid</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u64  %s, %gridid;  // 64-bit read of %gridid\nmov.u32  %r, %gridid;  // legacy code with 32-bit %gridid\n</pre></div>\n</div>\n</section>",
                "tooltip": "Grid identifier.\n\nSyntax (predefined)\n\n.sreg .u64 %gridid;\n\nDescription\n\nA predefined, read-only special register initialized with the per-grid temporal grid identifier. The\n\n%gridid is used by debuggers to distinguish CTAs and clusters within concurrent (small) grids.\n\nDuring execution, repeated launches of programs may occur, where each launch starts a\n\ngrid-of-CTAs. This variable provides the temporal grid launch number for this context.\n\nFor sm_1x targets, %gridid is limited to the range [0..2^16-1]. For sm_20,\n\n%gridid is limited to the range [0..2^32-1]. sm_30 supports the entire 64-bit range.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 as type .u16.\n\nRedefined as type .u32 in PTX ISA version 1.3.\n\nRedefined as type .u64 in PTX ISA version 3.0.\n\nFor compatibility with legacy PTX code, 16-bit and 32-bit mov and cvt instructions may be\n\nused to read the lower 16-bits or 32-bits of each component of %gridid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u64  %s, %gridid;  // 64-bit read of %gridid\n\nmov.u32  %r, %gridid;  // legacy code with 32-bit %gridid\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-gridid"
            };

        case "is_explicit_cluster":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-is-explicit-cluster\" target=\"_blank\" rel=\"noopener noreferrer\">is_explicit_cluster <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %is_explicit_cluster</h1><section id=\"special-registers-is-explicit-cluster\">\n\n\n<p>Checks if user has explicitly specified cluster launch.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .pred %is_explicit_cluster;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the predicate value of whether the cluster\nlaunch is explicitly specified by user.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .pred p;\n\nmov.pred  p, %is_explicit_cluster;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Checks if user has explicitly specified cluster launch.\n\nSyntax (predefined)\n\n.sreg .pred %is_explicit_cluster;\n\nDescription\n\nA predefined, read-only special register initialized with the predicate value of whether the cluster\n\nlaunch is explicitly specified by user.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .pred p;\n\nmov.pred  p, %is_explicit_cluster;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-is-explicit-cluster"
            };

        case "isspacep":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep\" target=\"_blank\" rel=\"noopener noreferrer\">isspacep <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: isspacep</h1><section id=\"data-movement-and-conversion-instructions-isspacep\">\n\n\n<p>Query whether a generic address falls within a specified state space window.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>isspacep.space  p, a;    // result is .pred\n\n.space = { const, .global, .local, .shared{::cta, ::cluster}, .param{::entry} };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Write predicate register <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> if generic address a falls within the specified state\nspace window and with <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> otherwise. 
Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.pred</span></code>; the source address\noperand must be of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.param{::entry}</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> if the generic address falls within the window of\n<a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a>, otherwise returns <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code>. If <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>\nis specified without any sub-qualifiers then it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.param::entry</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.global</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> for <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> as <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> window is contained within the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>\nwindow.</p>\n<p>If no sub-qualifier is specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, then <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> is assumed by default.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p><code class=\"docutils literal 
notranslate\"><span class=\"pre\">ispacep.shared::cluster</span></code> will return 1 for every shared memory address that is accessible to\nthe threads in the cluster, whereas <code class=\"docutils literal notranslate\"><span class=\"pre\">ispacep.shared::cta</span></code> will return 1 only if the address is\nof a variable declared in the executing CTA.</p>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.const</span></code> introduced in PTX ISA version 3.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.param</span></code> introduced in PTX ISA version 7.7.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifiers introduced in PTX ISA version 7.8.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::entry</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space introduced in PTX ISA version 8.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.param{::entry}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span 
class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>isspacep.const           iscnst, cptr;\nisspacep.global          isglbl, gptr;\nisspacep.local           islcl,  lptr;\nisspacep.shared          isshrd, sptr;\nisspacep.param::entry    isparam, pptr;\nisspacep.shared::cta     isshrdcta, sptr;\nisspacep.shared::cluster ishrdany sptr;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Query whether a generic address falls within a specified state space window.\n\nSyntax\n\nisspacep.space  p, a;    // result is .pred\n\n.space = { const, .global, .local, .shared{::cta, ::cluster}, .param{::entry} };\n\nDescription\n\nWrite predicate register p with 1 if generic address a falls within the specified state\n\nspace window and with 0 otherwise. Destination p has type .pred; the source address\n\noperand must be of type .u32 or .u64.\n\nisspacep.param{::entry} returns 1 if the generic address falls within the window of\n\nKernel Function Parameters, otherwise returns 0. If .param\n\nis specified without any sub-qualifiers then it defaults to .param::entry.\n\nisspacep.global returns 1 for Kernel Function Parameters as .param window is contained within the .global\n\nwindow.\n\nIf no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.\n\nNote\n\nispacep.shared::cluster will return 1 for every shared memory address that is accessible to\n\nthe threads in the cluster, whereas ispacep.shared::cta will return 1 only if the address is\n\nof a variable declared in the executing CTA.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nisspacep.const introduced in PTX ISA version 3.1.\n\nisspacep.param introduced in PTX ISA version 7.7.\n\nSupport for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.\n\nSupport for sub-qualifier ::entry on .param space introduced in PTX ISA version 8.3.\n\nTarget ISA Notes\n\nisspacep requires sm_20 or higher.\n\nisspacep.param{::entry} requires sm_70 or higher.\n\nSub-qualifier ::cta requires sm_30 or higher.\n\nSub-qualifier ::cluster requires sm_90 or higher.\n\nExamples\n\nisspacep.const           iscnst, cptr;\n\nisspacep.global          isglbl, gptr;\n\nisspacep.local           islcl,  lptr;\n\nisspacep.shared          isshrd, sptr;\n\nisspacep.param::entry    isparam, pptr;\n\nisspacep.shared::cta     isshrdcta, 
sptr;\n\nisspacep.shared::cluster ishrdany sptr;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep"
            };

        case "istypep":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep\" target=\"_blank\" rel=\"noopener noreferrer\">istypep <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Texture Instructions: istypep</h1><section id=\"texture-instructions-istypep\">\n\n\n<p>Query whether a register points to an opaque variable of a specified type.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>istypep.type   p, a;  // result is .pred\n\n.type = { .texref, .samplerref, .surfref };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Write predicate register <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> with 1 if register <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> points to an opaque variable of the\nspecified type, and with 0 otherwise. Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.pred</span></code>; the source address\noperand must be of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>istypep requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>istypep.texref istex, tptr;\nistypep.samplerref issampler, sptr;\nistypep.surfref issurface, surfptr;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Query whether a register points to an opaque variable of a specified type.\n\nSyntax\n\nistypep.type   p, a;  // result is .pred\n\n.type = { .texref, .samplerref, .surfref };\n\nDescription\n\nWrite predicate register p with 1 if register a points to an opaque variable of the\n\nspecified type, and with 0 otherwise. Destination p has type .pred; the source address\n\noperand must be of type .u64.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.0.\n\nTarget ISA Notes\n\nistypep requires sm_30 or higher.\n\nExamples\n\nistypep.texref istex, tptr;\n\nistypep.samplerref issampler, sptr;\n\nistypep.surfref issurface, surfptr;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep"
            };

        case "laneid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-laneid\" target=\"_blank\" rel=\"noopener noreferrer\">laneid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %laneid</h1><section id=\"special-registers-laneid\">\n\n\n<p>Lane Identifier.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %laneid;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register that returns the thread\u2019s lane within the warp. The lane\nidentifier ranges from zero to <code class=\"docutils literal notranslate\"><span class=\"pre\">WARP_SZ-1</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32  %r, %laneid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Lane Identifier.\n\nSyntax (predefined)\n\n.sreg .u32 %laneid;\n\nDescription\n\nA predefined, read-only special register that returns the thread\u2019s lane within the warp. The lane\n\nidentifier ranges from zero to WARP_SZ-1.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32  %r, %laneid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-laneid"
            };

        case "lanemask_eq":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-eq\" target=\"_blank\" rel=\"noopener noreferrer\">lanemask_eq <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %lanemask_eq</h1><section id=\"special-registers-lanemask-eq\">\n\n\n<p>32-bit mask with bit set in position equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %lanemask_eq;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with a 32-bit mask with a bit set in the\nposition equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%lanemask_eq</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32     %r, %lanemask_eq;\n</pre></div>\n</div>\n</section>",
                "tooltip": "32-bit mask with bit set in position equal to the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_eq;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with a bit set in the\n\nposition equal to the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_eq requires sm_20 or higher.\n\nExamples\n\nmov.u32     %r, %lanemask_eq;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-eq"
            };

        case "lanemask_ge":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-ge\" target=\"_blank\" rel=\"noopener noreferrer\">lanemask_ge <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %lanemask_ge</h1><section id=\"special-registers-lanemask-ge\">\n\n\n<p>32-bit mask with bits set in positions greater than or equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %lanemask_ge;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\ngreater than or equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%lanemask_ge</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32     %r, %lanemask_ge;\n</pre></div>\n</div>\n</section>",
                "tooltip": "32-bit mask with bits set in positions greater than or equal to the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_ge;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\ngreater than or equal to the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_ge requires sm_20 or higher.\n\nExamples\n\nmov.u32     %r, %lanemask_ge;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-ge"
            };

        case "lanemask_gt":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-gt\" target=\"_blank\" rel=\"noopener noreferrer\">lanemask_gt <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %lanemask_gt</h1><section id=\"special-registers-lanemask-gt\">\n\n\n<p>32-bit mask with bits set in positions greater than the thread\u2019s lane number in the warp.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %lanemask_gt;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\ngreater than the thread\u2019s lane number in the warp.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%lanemask_gt</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32     %r, %lanemask_gt;\n</pre></div>\n</div>\n</section>",
                "tooltip": "32-bit mask with bits set in positions greater than the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_gt;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\ngreater than the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_gt requires sm_20 or higher.\n\nExamples\n\nmov.u32     %r, %lanemask_gt;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-gt"
            };

        case "lanemask_le":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-le\" target=\"_blank\" rel=\"noopener noreferrer\">lanemask_le <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %lanemask_le</h1><section id=\"special-registers-lanemask-le\">\n\n\n<p>32-bit mask with bits set in positions less than or equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %lanemask_le;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\nless than or equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%lanemask_le</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32     %r, %lanemask_le\n</pre></div>\n</div>\n</section>",
                "tooltip": "32-bit mask with bits set in positions less than or equal to the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_le;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\nless than or equal to the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_le requires sm_20 or higher.\n\nExamples\n\nmov.u32     %r, %lanemask_le\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-le"
            };

        case "lanemask_lt":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-lt\" target=\"_blank\" rel=\"noopener noreferrer\">lanemask_lt <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %lanemask_lt</h1><section id=\"special-registers-lanemask-lt\">\n\n\n<p>32-bit mask with bits set in positions less than the thread\u2019s lane number in the warp.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %lanemask_lt;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\nless than the thread\u2019s lane number in the warp.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%lanemask_lt</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32     %r, %lanemask_lt;\n</pre></div>\n</div>\n</section>",
                "tooltip": "32-bit mask with bits set in positions less than the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_lt;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\nless than the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_lt requires sm_20 or higher.\n\nExamples\n\nmov.u32     %r, %lanemask_lt;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-lt"
            };

        case "ld":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld\" target=\"_blank\" rel=\"noopener noreferrer\">ld <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc\" target=\"_blank\" rel=\"noopener noreferrer\">ld.global.nc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: ld</h1><section id=\"data-movement-and-conversion-instructions-ld\">\n\n\n<p>Load a register variable from an addressable state space variable.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ld{.weak}{.ss}{.cop}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{.unified}{, cache-policy};\n\nld{.weak}{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{.unified}{, cache-policy};\n\nld.volatile{.ss}{.level::prefetch_size}{.vec}.type  d, [a];\n\nld.relaxed.scope{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{, cache-policy};\n\nld.acquire.scope{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{, cache-policy};\n\nld.mmio.relaxed.sys{.global}.type  d, [a];\n\n.ss =                       { .const, .global, .local, .param{::entry, ::func}, .shared{::cta, ::cluster} };\n.cop =                      { .ca, .cg, .cs, .lu, .cv };\n.level::eviction_priority = { .L1::evict_normal, .L1::evict_unchanged,\n                              .L1::evict_first, .L1::evict_last, .L1::no_allocate };\n.level::cache_hint =        { .L2::cache_hint };\n.level::prefetch_size =     { .L2::64B, 
.L2::128B, .L2::256B }\n.scope =                    { .cta, .cluster, .gpu, .sys };\n.vec =                      { .v2, .v4 };\n.type =                     { .b8, .b16, .b32, .b64, .b128,\n                              .u8, .u16, .u32, .u64,\n                              .s8, .s16, .s32, .s64,\n                              .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Load register variable <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> from the location specified by the source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> in\nspecified state space. If no state space is given, perform the load using <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>.</p>\n<p>If no sub-qualifier is specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, then <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> is assumed by default.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p>If no sub-qualifier is specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> state space, then\u202f:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">::func</span></code> is assumed when access is inside a device function.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">::entry</span></code> is assumed when accessing kernel function parameters from entry function. 
Otherwise, when\naccessing device function parameters or any other <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> variables from entry function <code class=\"docutils literal notranslate\"><span class=\"pre\">::func</span></code>\nis assumed by default.</p></li>\n</ul>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param::entry</span></code> instruction, operand a must be a kernel parameter address, otherwise behavior\nis undefined. For <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param::func</span></code> instruction, operand a must be a device function parameter address,\notherwise behavior is undefined.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param{::func}</span></code> used for reading value returned from device function call cannot be\npredicated. See <a class=\"reference external\" href=\"#parameter-state-space\">Parameter State Space</a> and\n<a class=\"reference external\" href=\"#function-declarations-and-definitions\">Function Declarations and Definitions</a> for descriptions\nof the proper use of <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param</span></code>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> qualifiers indicate memory synchronization as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. The <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier\nindicates the set of threads with which an <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.relaxed</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.acquire</span></code> instruction can directly\nsynchronize<sup>1</sup>. 
The <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifier indicates a memory instruction with no synchronization.\nThe effects of this instruction become visible to other threads only when synchronization is established\nby other means.</p>\n<p>The semantic details of <code class=\"docutils literal notranslate\"><span class=\"pre\">.mmio</span></code> qualifier are described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. Only <code class=\"docutils literal notranslate\"><span class=\"pre\">.sys</span></code> thread scope is valid for <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.mmio</span></code> operation. The\nqualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.mmio</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> must be specified together.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.volatile</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> qualifiers are mutually exclusive. When\nnone of these is specified, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifier is assumed by default.</p>\n<p>An <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.volatile</span></code> operation is always performed and it will not be reordered with respect to other\n<code class=\"docutils literal notranslate\"><span class=\"pre\">volatile</span></code> operations to the same memory location. <code class=\"docutils literal notranslate\"><span class=\"pre\">volatile</span></code> and non-volatile load operations\nto the same memory location may be reordered. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">ld.volatile</span></code> has the same memory synchronization\nsemantics as <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.relaxed.sys</span></code>.</p>\n<p>The qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.volatile</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> may be used only with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> spaces and with generic addressing, where the address points to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> space. Cache operations are not permitted with these qualifiers. 
The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.mmio</span></code>\nmay be used only with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space and with generic addressing, where the address points to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space.</p>\n<p>The optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.unified</span></code> must be specified on operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> if <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is the address of a\nvariable declared with <code class=\"docutils literal notranslate\"><span class=\"pre\">.unified</span></code> attribute as described in <a class=\"reference external\" href=\"#variable-and-function-attribute-directive-attribute\">Variable and Function Attribute\nDirective: .attribute</a>.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> specifies the eviction policy that will be used during\nmemory access.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier is a hint to fetch additional data of the specified size\ninto the respective cache level.The sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch_size</span></code> can be set to either of <code class=\"docutils literal notranslate\"><span class=\"pre\">64B</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">128B</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">256B</span></code> thereby allowing the prefetch size to be 64 Bytes, 128 Bytes or 256 Bytes\nrespectively.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.level::prefetch_size</span></code> may only be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space and with\ngeneric addressing where the address points to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space. If the generic address does\nnot fall within the address window of the global memory, then the prefetching behavior is undefined.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier is treated as a performance hint only.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p>The qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.unified</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> are only supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state\nspace and for generic addressing where the address points to the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. 
It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><sup>1</sup> This synchronization is further extended to other threads through the transitive nature of\n<em>causality order</em>, as described in the memory consistency model.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a;             // named variable a\nd = *(&amp;a+immOff)   // variable-plus-offset\nd = *a;            // register\nd = *(a+immOff);   // register-plus-offset\nd = *(immAddr);    // immediate address\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> must be in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.reg</span></code> state space.</p>\n<p>A destination register wider than the specified type may be used. The value loaded is sign-extended\nto the destination register width for signed integers, and is zero-extended to the destination\nregister width for unsigned and bit-size types. 
See\n<a class=\"reference internal\" href=\"#operand-size-exceeding-instruction-type-size-relaxed-type-checking-rules-destination-operands\"><span class=\"std std-numref\">Table 27</span></a>\nfor a description of these relaxed type-checking rules.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> data may be loaded using <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.b16</span></code>, and then converted to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> using\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> or can be used in half precision floating point instructions.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> data may be loaded using <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.b32</span></code> and then used in half precision floating point\ninstructions.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>ld introduced in PTX ISA version 1.0. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">ld.volatile</span></code> introduced in PTX ISA version 1.1.</p>\n<p>Generic addressing and cache operations introduced in PTX ISA version 2.0.</p>\n<p>Support for scope qualifier, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifiers introduced in PTX ISA\nversion 6.0.</p>\n<p>Support for generic addressing of .const space added in PTX ISA version 3.1.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code>\nqualifiers introduced in PTX ISA version 7.4.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifiers introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.unified</span></code> qualifier introduced in PTX ISA version 8.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.mmio</span></code> qualifier introduced in PTX ISA version 8.2.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::entry</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">::func</span></code> sub-qualifiers on <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.param</span></code> space introduced in PTX ISA\nversion 8.3.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type introduced in PTX ISA version 8.3.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.sys</span></code> scope with <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type introduced in PTX ISA version 8.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ld.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Support for scope qualifier, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifiers require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or\nhigher.</p>\n<p>Generic addressing requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Cache operations require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::256B</span></code> and <code class=\"docutils literal 
notranslate\"><span class=\"pre\">.L2::cache_hint</span></code> qualifiers requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.unified</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.mmio</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ld.global.f32    d,[a];\nld.shared.v4.b32 Q,[p];\nld.const.s32     d,[p+4];\nld.local.b32     x,[p+-8]; // negative offset\nld.local.b64     x,[240];  // immediate address\n\nld.global.b16    %r,[fs];  // load .f16 data into 32-bit reg\ncvt.f32.f16      %r,%r;    // up-convert f16 data to f32\n\nld.global.b32    %r0, [fs];     // load .f16x2 data in 32-bit reg\nld.global.b32  
  %r1, [fs + 4]; // load .f16x2 data in 32-bit reg\nadd.rn.f16x2     %d0, %r0, %r1; // addition of f16x2 data\nld.global.relaxed.gpu.u32 %r0, [gbl];\nld.shared.acquire.gpu.u32 %r1, [sh];\nld.global.relaxed.cluster.u32 %r2, [gbl];\nld.shared::cta.acquire.gpu.u32 %r2, [sh + 4];\nld.shared::cluster.u32 %r3, [sh + 8];\nld.global.mmio.relaxed.sys.u32 %r3, [gbl];\n\nld.global.f32    d,[ugbl].unified;\nld.b32           %r0, [%r1].unified;\n\nld.global.L1::evict_last.u32  d, [p];\n\nld.global.L2::64B.b32   %r0, [gbl]; // Prefetch 64B to L2\nld.L2::128B.f64         %r1, [gbl]; // Prefetch 128B to L2\nld.global.L2::256B.f64  %r2, [gbl]; // Prefetch 256B to L2\n\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 cache-policy, 1;\nld.global.L2::cache_hint.b64  x, [p], cache-policy;\nld.param::entry.b32 %rp1, [kparam1];\n\nld.global.b128   %r0, [gbl];   // 128-bit load\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: ld.global.nc</h1><section id=\"data-movement-and-conversion-instructions-ld-global-nc\">\n\n\n<p>Load a register variable from global state space via non-coherent cache.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ld.global{.cop}.nc{.level::cache_hint}{.level::prefetch_size}.type                 d, [a]{, cache-policy};\nld.global{.cop}.nc{.level::cache_hint}{.level::prefetch_size}.vec.type             d, [a]{, cache-policy};\n\nld.global.nc{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}.type      d, [a]{, cache-policy};\nld.global.nc{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}.vec.type  d, [a]{, cache-policy};\n\n.cop  =                     { .ca, .cg, .cs };     // cache operation\n.level::eviction_priority = { .L1::evict_normal, .L1::evict_unchanged,\n                              .L1::evict_first, .L1::evict_last, .L1::no_allocate};\n.level::cache_hint =        { .L2::cache_hint 
};\n.level::prefetch_size =     { .L2::64B, .L2::128B, .L2::256B }\n.vec  =                     { .v2, .v4 };\n.type =                     { .b8, .b16, .b32, .b64, .b128,\n                              .u8, .u16, .u32, .u64,\n                              .s8, .s16, .s32, .s64,\n                              .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Load register variable <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> from the location specified by the source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> in the\nglobal state space, and optionally cache in non-coherent read-only cache.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>On some architectures, the texture cache is larger, has higher bandwidth, and longer latency than\nthe global memory cache. For applications with sufficient parallelism to cover the longer\nlatency, <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.global.nc</span></code> should offer better performance than <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.global</span></code> on such\narchitectures.</p>\n</div>\n<p>The address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> may contain a <a class=\"reference external\" href=\"#generic-addressing\">generic address</a> pointing to the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space. 
Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are\ndescribed in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a></p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> specifies the eviction policy that will be used during\nmemory access.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier is a hint to fetch additional data of the specified size\ninto the respective cache level.The sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch_size</span></code> can be set to either of <code class=\"docutils literal notranslate\"><span class=\"pre\">64B</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">128B</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">256B</span></code> thereby allowing the prefetch size to be 64 Bytes, 128 Bytes or 256 Bytes\nrespectively.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier is treated as a performance hint only.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. 
It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a;             // named variable a\nd = *(&amp;a+immOff)   // variable-plus-offset\nd = *a;            // register\nd = *(a+immOff);   // register-plus-offset\nd = *(immAddr);    // immediate address\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> must be in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.reg</span></code> state space.</p>\n<p>A destination register wider than the specified type may be used. The value loaded is sign-extended\nto the destination register width for signed integers, and is zero-extended to the destination\nregister width for unsigned and bit-size types.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> data may be loaded using <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.b16</span></code>, and then converted to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> using <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.1.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code>\nqualifiers introduced in PTX ISA version 7.4.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.b128</span></code> type introduced in PTX ISA version 8.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_32</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ld.global.nc.f32           d, [a];\nld.gloal.nc.L1::evict_last.u32 d, [a];\n\ncreatepolicy.fractional.L2::evict_last.b64 cache-policy, 0.5;\nld.global.nc.L2::cache_hint.f32  d, [a], cache-policy;\n\nld.global.nc.L2::64B.b32      d,  [a];     // Prefetch 64B to L2\nld.global.nc.L2::256B.f64     d,  [a];     // Prefetch 256B to L2\n\nld.global.nc.b128             d,  [a];\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Data Movement and Conversion Instructions: ld\n\n\n\nLoad a register variable from an addressable state space variable.\n\nSyntax\n\nld{.weak}{.ss}{.cop}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{.unified}{, cache-policy};\n\nld{.weak}{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{.unified}{, cache-policy};\n\nld.volatile{.ss}{.level::prefetch_size}{.vec}.type  d, [a];\n\nld.relaxed.scope{.ss}{.le...\n\n=====Data Movement and Conversion Instructions: ld.global.nc\n\n\n\nLoad a register variable from global state space via non-coherent cache.\n\nSyntax\n\nld.global{.cop}.nc{.level::cache_hint}{.level::prefetch_size}.type                 d, [a]{, cache-policy};\n\nld.global{.cop}.nc{.level::cache_hint}{.level::prefetch_size}.vec.type             d, [a]{, cache-policy};\n\nld.global.nc{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}.type      d, [a]{, cach... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld"
            };

        case "ldu":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu\" target=\"_blank\" rel=\"noopener noreferrer\">ldu <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: ldu</h1><section id=\"data-movement-and-conversion-instructions-ldu\">\n\n\n<p>Load read-only data from an address that is common across threads in the warp.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ldu{.ss}.type      d, [a];       // load from address\nldu{.ss}.vec.type  d, [a];       // vec load from address\n\n.ss   = { .global };             // state space\n.vec  = { .v2, .v4 };\n.type = { .b8, .b16, .b32, .b64, .b128,\n          .u8, .u16, .u32, .u64,\n          .s8, .s16, .s32, .s64,\n                     .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Load <em>read-only</em> data into register variable <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> from the location specified by the source address\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> in the global state space, where the address is guaranteed to be the same across all\nthreads in the warp. 
If no state space is given, perform the load using <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a;             // named variable a\nd = *(&amp;a+immOff)   // variable-plus-offset\nd = *a;            // register\nd = *(a+immOff);   // register-plus-offset\nd = *(immAddr);    // immediate address\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> must be in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.reg</span></code> state space.</p>\n<p>A destination register wider than the specified type may be used. The value loaded is sign-extended\nto the destination register width for signed integers, and is zero-extended to the destination\nregister width for unsigned and bit-size types. 
See\n<a class=\"reference internal\" href=\"#operand-size-exceeding-instruction-type-size-relaxed-type-checking-rules-destination-operands\"><span class=\"std std-numref\">Table 27</span></a>\nfor a description of these relaxed type-checking rules.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> data may be loaded using <code class=\"docutils literal notranslate\"><span class=\"pre\">ldu.b16</span></code>, and then converted to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> using\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code>or can be used in half precision floating point instructions.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> data may be loaded using <code class=\"docutils literal notranslate\"><span class=\"pre\">ldu.b32</span></code> and then used in half precision floating point\ninstructions.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type introduced in PTX ISA version 8.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ldu.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ldu.global.f32    d,[a];\nldu.global.b32    d,[p+4];\nldu.global.v4.f32 Q,[p];\nldu.global.b128   
d,[a];\n</pre></div>\n</div>\n</section>",
                "tooltip": "Load read-only data from an address that is common across threads in the warp.\n\nSyntax\n\nldu{.ss}.type      d, [a];       // load from address\n\nldu{.ss}.vec.type  d, [a];       // vec load from address\n\n.ss   = { .global };             // state space\n\n.vec  = { .v2, .v4 };\n\n.type = { .b8, .b16, .b32, .b64, .b128,\n\n          .u8, .u16, .u32, .u64,\n\n          .s8, .s16, .s32, .s64,\n\n                     .f32, .f64 };\n\nDescription\n\nLoad read-only data into register variable d from the location specified by the source address\n\noperand a in the global state space, where the address is guaranteed to be the same across all\n\nthreads in the warp. If no state space is given, perform the load using Generic Addressing.\n\nSupported addressing modes for operand a and alignment requirements are described in Addresses\n\nas Operands\n\nSemantics\n\nd = a;             // named variable a\n\nd = *(&a+immOff)   // variable-plus-offset\n\nd = *a;            // register\n\nd = *(a+immOff);   // register-plus-offset\n\nd = *(immAddr);    // immediate address\n\nNotes\n\nDestination d must be in the .reg state space.\n\nA destination register wider than the specified type may be used. The value loaded is sign-extended\n\nto the destination register width for signed integers, and is zero-extended to the destination\n\nregister width for unsigned and bit-size types. 
See\n\nTable 27\n\nfor a description of these relaxed type-checking rules.\n\n.f16 data may be loaded using ldu.b16, and then converted to .f32 or .f64 using\n\ncvtor can be used in half precision floating point instructions.\n\n.f16x2 data may be loaded using ldu.b32 and then used in half precision floating point\n\ninstructions.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nSupport for .b128 type introduced in PTX ISA version 8.3.\n\nTarget ISA Notes\n\nldu.f64 requires sm_13 or higher.\n\nSupport for .b128 type requires sm_70 or higher.\n\nExamples\n\nldu.global.f32    d,[a];\n\nldu.global.b32    d,[p+4];\n\nldu.global.v4.f32 Q,[p];\n\nldu.global.b128   d,[a];\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu"
            };

        case "lg2":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2\" target=\"_blank\" rel=\"noopener noreferrer\">lg2(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: lg2</h1><section id=\"floating-point-instructions-lg2\">\n\n\n<p>Find the base-2 logarithm of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>lg2.approx{.ftz}.f32  d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Determine the log<sub>2</sub> of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = log(a) / log(2);\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">lg2.approx.f32</span></code> implements a fast approximation to log<sub>2</sub>(a).</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 57%\"/>\n<col style=\"width: 43%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p>-Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>-subnormal</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>-0.0</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>+0.0</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>+subnormal</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>+Inf</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The 
maximum absolute error is 2<sup>-22.6</sup> for mantissa.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">lg2.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p>Subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">lg2.f32</span></code> introduced in PTX ISA version 1.0. Explicit modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code>\nintroduced in PTX ISA version 1.4.</p>\n<p>For PTX ISA version 1.4 and later, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> modifier is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">lg2.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">lg2.approx.ftz.f32</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>lg2.approx.ftz.f32  la, a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Find the base-2 logarithm of a value.\n\nSyntax\n\nlg2.approx{.ftz}.f32  d, a;\n\nDescription\n\nDetermine the log2 of a.\n\nSemantics\n\nd = log(a) / log(2);\n\nNotes\n\nlg2.approx.f32 implements a fast approximation to log2(a).\n\n\n\nInput\n\nResult\n\n\n\n-Inf\n\nNaN\n\n-subnormal\n\n-Inf\n\n-0.0\n\n-Inf\n\n+0.0\n\n-Inf\n\n+subnormal\n\n-Inf\n\n+Inf\n\n+Inf\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2-22.6 for mantissa.\n\nSubnormal numbers:\n\nsm_20+\n\nBy default, subnormal numbers are supported.\n\nlg2.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1x\n\nSubnormal inputs and results to sign-preserving zero.\n\nPTX ISA Notes\n\nlg2.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\n\nintroduced in PTX ISA version 1.4.\n\nFor PTX ISA version 1.4 and later, the .approx modifier is required.\n\nFor PTX ISA versions 1.0 through 1.3, lg2.f32 defaults to lg2.approx.ftz.f32.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nlg2.approx.ftz.f32  la, a;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2"
            };

        case "loc":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-loc\" target=\"_blank\" rel=\"noopener noreferrer\">loc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Debugging Directives: .loc</h1><section id=\"debugging-directives-loc\">\n\n\n<p>Source file location.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.loc file_index line_number column_position\n.loc file_index line_number column_position,function_name label {+ immediate }, inlined_at file_index2 line_number2 column_position2\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares the source file location (source file, line number, and column position) to be associated\nwith lexically subsequent PTX instructions. <code class=\"docutils literal notranslate\"><span class=\"pre\">.loc</span></code> refers to <code class=\"docutils literal notranslate\"><span class=\"pre\">file_index</span></code> which is defined by a\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.file</span></code> directive.</p>\n<p>To indicate PTX instructions that are generated from a function that got inlined, additional\nattribute <code class=\"docutils literal notranslate\"><span class=\"pre\">.inlined_at</span></code> can be specified as part of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.loc</span></code> directive. <code class=\"docutils literal notranslate\"><span class=\"pre\">.inlined_at</span></code>\nattribute specifies source location at which the specified function is inlined. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">file_index2</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">line_number2</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">column_position2</span></code> specify the location at which function is inlined. Source\nlocation specified as part of <code class=\"docutils literal notranslate\"><span class=\"pre\">.inlined_at</span></code> directive must lexically precede as source location in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.loc</span></code> directive.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">function_name</span></code> attribute specifies an offset in the DWARF section named\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.debug_str</span></code>. Offset is specified as <code class=\"docutils literal notranslate\"><span class=\"pre\">label</span></code> expression or <code class=\"docutils literal notranslate\"><span class=\"pre\">label</span> <span class=\"pre\">+</span> <span class=\"pre\">immediate</span></code> expression\nwhere <code class=\"docutils literal notranslate\"><span class=\"pre\">label</span></code> is defined in <code class=\"docutils literal notranslate\"><span class=\"pre\">.debug_str</span></code> section. DWARF section <code class=\"docutils literal notranslate\"><span class=\"pre\">.debug_str</span></code> contains ASCII\nnull-terminated strings that specify the name of the function that is inlined.</p>\n<p>Note that a PTX instruction may have a single associated source location, determined by the nearest\nlexically preceding .loc directive, or no associated source location if there is no preceding .loc\ndirective. Labels in PTX inherit the location of the closest lexically following instruction. 
A\nlabel with no following PTX instruction has no associated source location.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">function_name</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">inlined_at</span></code> attributes are introduced in PTX ISA version 7.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>    .loc 2 4237 0\nL1:                        // line 4237, col 0 of file #2,\n                           // inherited from mov\n    mov.u32  %r1,%r2;      // line 4237, col 0 of file #2\n    add.u32  %r2,%r1,%r3;  // line 4237, col 0 of file #2\n...\nL2:                        // line 4239, col 5 of file #2,\n                           // inherited from sub\n    .loc 2 4239 5\n    sub.u32  %r2,%r1,%r3;  // line 4239, col 5 of file #2\n    .loc 1 21 3\n    .loc 1 9 3, function_name info_string0, inlined_at 1 21 3\n    ld.global.u32   %r1, [gg]; // Function at line 9\n    setp.lt.s32 %p1, %r1, 8;   // inlined at line 21\n    .loc 1 27 3\n    .loc 1 10 5, function_name info_string1, inlined_at 1 27 3\n    .loc 1 15 3, function_name .debug_str+16, inlined_at 1 10 5\n    setp.ne.s32 %p2, %r1, 18;\n    @%p2 bra    BB2_3;\n\n    .section .debug_str {\n    info_string0:\n     .b8 95  // _\n     .b8 90  // z\n     .b8 51  // 3\n     .b8 102 // f\n     .b8 111 // o\n     .b8 111 // o\n     .b8 118 // v\n     .b8 0\n\n    info_string1:\n     .b8 95  // _\n     .b8 90  // z\n     .b8 51  // 3\n     .b8 98  // b\n     .b8 97  // a\n     .b8 114 // r\n     .b8 118 // v\n     .b8 0\n     .b8 95  // _\n     .b8 90  // z\n     .b8 51  // 3\n     .b8 99  // c\n     .b8 97  // a\n     .b8 114 // r\n     .b8 118 // v\n     .b8 0\n    }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Source file location.\n\nSyntax\n\n.loc file_index line_number column_position\n\n.loc file_index line_number column_position,function_name label {+ immediate }, inlined_at file_index2 line_number2 column_position2\n\nDescription\n\nDeclares the source file location (source file, line number, and column position) to be associated\n\nwith lexically subsequent PTX instructions. .loc refers to file_index which is defined by a\n\n.file directive.\n\nTo indicate PTX instructions that are generated from a function that got inlined, additional\n\nattribute .inlined_at can be specified as part of the .loc directive. .inlined_at\n\nattribute specifies source location at which the specified function is inlined. file_index2,\n\nline_number2, and column_position2 specify the location at which function is inlined. Source\n\nlocation specified as part of .inlined_at directive must lexically precede as source location in\n\n.loc directive.\n\nThe function_name attribute specifies an offset in the DWARF section named\n\n.debug_str. Offset is specified as label expression or label + immediate expression\n\nwhere label is defined in .debug_str section. DWARF section .debug_str contains ASCII\n\nnull-terminated strings that specify the name of the function that is inlined.\n\nNote that a PTX instruction may have a single associated source location, determined by the nearest\n\nlexically preceding .loc directive, or no associated source location if there is no preceding .loc\n\ndirective. Labels in PTX inherit the location of the closest lexically following instruction. 
A\n\nlabel with no following PTX instruction has no associated source location.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nfunction_name and inlined_at attributes are introduced in PTX ISA version 7.2.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n    .loc 2 4237 0\n\nL1:                        // line 4237, col 0 of file #2,\n\n                           // inherited from mov\n\n    mov.u32  %r1,%r2;      // line 4237, col 0 of file #2\n\n    add.u32  %r2,%r1,%r3;  // line 4237, col 0 of file #2\n\n...\n\nL2:                        // line 4239, col 5 of file #2,\n\n                           // inherited from sub\n\n    .loc 2 4239 5\n\n    sub.u32  %r2,%r1,%r3;  // line 4239, col 5 of file #2\n\n    .loc 1 21 3\n\n    .loc 1 9 3, function_name info_string0, inlined_at 1 21 3\n\n    ld.global.u32   %r1, [gg]; // Function at line 9\n\n    setp.lt.s32 %p1, %r1, 8;   // inlined at line 21\n\n    .loc 1 27 3\n\n    .loc 1 10 5, function_name info_string1, inlined_at 1 27 3\n\n    .loc 1 15 3, function_name .debug_str+16, inlined_at 1 10 5\n\n    setp.ne.s32 %p2, %r1, 18;\n\n    @%p2 bra    BB2_3;\n\n    .section .debug_str {\n\n    info_string0:\n\n     .b8 95  // _\n\n     .b8 90  // z\n\n     .b8 51  // 3\n\n     .b8 102 // f\n\n     .b8 111 // o\n\n     .b8 111 // o\n\n     .b8 118 // v\n\n     .b8 0\n\n    info_string1:\n\n     .b8 95  // _\n\n     .b8 90  // z\n\n     .b8 51  // 3\n\n     .b8 98  // b\n\n     .b8 97  // a\n\n     .b8 114 // r\n\n     .b8 118 // v\n\n     .b8 0\n\n     .b8 95  // _\n\n     .b8 90  // z\n\n     .b8 51  // 3\n\n     .b8 99  // c\n\n     .b8 97  // a\n\n     .b8 114 // r\n\n     .b8 118 // v\n\n     .b8 0\n\n    }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-loc"
            };

        case "lop3":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3\" target=\"_blank\" rel=\"noopener noreferrer\">lop3 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: lop3</h1><section id=\"logic-and-shift-instructions-lop3\">\n\n\n<p>Arbitrary logical operation on 3 inputs.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>lop3.b32 d, a, b, c, immLut;\nlop3.BoolOp.b32 d|p, a, b, c, immLut, q;\n\n.BoolOp   = { .or , .and };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute bitwise logical operation on inputs <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> and store the result in destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Optionally, <code class=\"docutils literal notranslate\"><span class=\"pre\">.BoolOp</span></code> can be specified to compute the predicate result <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> by performing a\nBoolean operation on the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with the predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">q</span></code> in the following manner:</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>p = (d != 0) BoolOp q;\n</pre></div>\n</div>\n<p>The sink symbol \u2018_\u2019 may be used in place of the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> when <code 
class=\"docutils literal notranslate\"><span class=\"pre\">.BoolOp</span></code> qualifier\nis specified.</p>\n<p>The logical operation is defined by a look-up table which, for 3 inputs, can be represented as an\n8-bit value specified by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">immLut</span></code> as described below. <code class=\"docutils literal notranslate\"><span class=\"pre\">immLut</span></code> is an integer constant\nthat can take values from 0 to 255, thereby allowing up to 256 distinct logical operations on inputs\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>.</p>\n<p>For a logical operation <code class=\"docutils literal notranslate\"><span class=\"pre\">F(a,</span> <span class=\"pre\">b,</span> <span class=\"pre\">c)</span></code> the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">immLut</span></code> can be computed by applying the same\noperation to three predefined constant values as follows:</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>ta = 0xF0;\ntb = 0xCC;\ntc = 0xAA;\n\nimmLut = F(ta, tb, tc);\n</pre></div>\n</div>\n<p>Examples:</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>If F = (a &amp; b &amp; c);\nimmLut = 0xF0 &amp; 0xCC &amp; 0xAA = 0x80\n\nIf F = (a | b | c);\nimmLut = 0xF0 | 0xCC | 0xAA = 0xFE\n\nIf F = (a &amp; b &amp; ~c);\nimmLut = 0xF0 &amp; 0xCC &amp; (~0xAA) = 0x40\n\nIf F = ((a &amp; b | c) ^ a);\nimmLut = (0xF0 &amp; 0xCC | 0xAA) ^ 0xF0 = 0x1A\n</pre></div>\n</div>\n<p>The following table illustrates computation of <code class=\"docutils literal notranslate\"><span class=\"pre\">immLut</span></code> for various logical operations:</p>\n<table class=\"table-no-stripes docutils 
align-default\">\n<colgroup>\n<col style=\"width: 9%\"/>\n<col style=\"width: 3%\"/>\n<col style=\"width: 3%\"/>\n<col style=\"width: 12%\"/>\n<col style=\"width: 17%\"/>\n<col style=\"width: 18%\"/>\n<col style=\"width: 7%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 13%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>ta</p></th>\n<th class=\"head\"><p>tb</p></th>\n<th class=\"head\"><p>tc</p></th>\n<th class=\"head\"><p>Oper 0 (False)</p></th>\n<th class=\"head\"><p>Oper 1 (ta &amp; tb &amp; tc)</p></th>\n<th class=\"head\"><p>Oper 2 (ta &amp; tb &amp; ~tc)</p></th>\n<th class=\"head\"><p>\u2026</p></th>\n<th class=\"head\"><p>Oper 254 (ta | tb | tc)</p></th>\n<th class=\"head\"><p>Oper 255 (True)</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td rowspan=\"8\"><p>\u2026</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr 
class=\"row-even\">\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\">\n<td colspan=\"3\"><p><strong>immLut</strong></p></td>\n<td><p><strong>0x0</strong></p></td>\n<td><p><strong>0x80</strong></p></td>\n<td><p><strong>0x40</strong></p></td>\n<td><p><strong>\u2026</strong></p></td>\n<td><p><strong>0xFE</strong></p></td>\n<td><p><strong>0xFF</strong></p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>F = GetFunctionFromTable(immLut); // returns the function corresponding to immLut value\nd = F(a, b, c);\nif (BoolOp specified) {\n    p = (d != 0) BoolOp q;\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.3.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.BoolOp</span></code> qualifier introduced in PTX ISA version 8.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code> or higher.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.BoolOp</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>lop3.b32       d, a, b, c, 0x40;\nlop3.or.b32  d|p, a, b, c, 0x3f, q;\nlop3.and.b32 _|p, a, b, c, 0x3f, q;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Arbitrary logical operation on 3 inputs.\n\nSyntax\n\nlop3.b32 d, a, b, c, immLut;\n\nlop3.BoolOp.b32 d|p, a, b, c, immLut, q;\n\n.BoolOp   = { .or , .and };\n\nDescription\n\nCompute bitwise logical operation on inputs a, b, c and store the result in destination\n\nd.\n\nOptionally, .BoolOp can be specified to compute the predicate result p by performing a\n\nBoolean operation on the destination operand d with the predicate q in the following manner:\n\np = (d != 0) BoolOp q;\n\nThe sink symbol \u2018_\u2019 may be used in place of the destination operand d when .BoolOp qualifier\n\nis specified.\n\nThe logical operation is defined by a look-up table which, for 3 inputs, can be represented as an\n\n8-bit value specified by operand immLut as described below. immLut is an integer constant\n\nthat can take values from 0 to 255, thereby allowing up to 256 distinct logical operations on inputs\n\na, b, c.\n\nFor a logical operation F(a, b, c) the value of immLut can be computed by applying the same\n\noperation to three predefined constant values as follows:\n\nta = 0xF0;\n\ntb = 0xCC;\n\ntc = 0xAA;\n\nimmLut = F(ta, tb, tc);\n\nExamples:\n\nIf F = (a & b & c);\n\nimmLut = 0xF0 & 0xCC & 0xAA = 0x80\n\nIf F = (a | b | c);\n\nimmLut = 0xF0 | 0xCC | 0xAA = 0xFE\n\nIf F = (a & b & ~c);\n\nimmLut = 0xF0 & 0xCC & (~0xAA) = 0x40\n\nIf F = ((a & b | c) ^ a);\n\nimmLut = (0xF0 & 0xCC | 0xAA) ^ 0xF0 = 0x1A\n\nThe following table illustrates computation of immLut for various logical operations:\n\n\n\n\n\n\n\nta\n\ntb\n\ntc\n\nOper 0 (False)\n\nOper 1 (ta & tb & tc)\n\nOper 2 (ta & tb & ~tc)\n\n\u2026\n\nOper 254 (ta | tb | tc)\n\nOper 255 
(True)\n\n\n\n0\n\n0\n\n0\n\n0\n\n0\n\n0\n\n\u2026\n\n0\n\n1\n\n0\n\n0\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n0\n\n1\n\n0\n\n0\n\n0\n\n0\n\n1\n\n1\n\n0\n\n1\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n0\n\n0\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n0\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n1\n\n1\n\n1\n\n0\n\n1\n\n0\n\n1\n\n1\n\nimmLut\n\n0x0\n\n0x80\n\n0x40\n\n\u2026\n\n0xFE\n\n0xFF\n\nSemantics\n\nF = GetFunctionFromTable(immLut); // returns the function corresponding to immLut value\n\nd = F(a, b, c);\n\nif (BoolOp specified) {\n\n    p = (d != 0) BoolOp q;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.3.\n\nSupport for .BoolOp qualifier introduced in PTX ISA version 8.2.\n\nTarget ISA Notes\n\nRequires sm_50 or higher.\n\nQualifier .BoolOp requires sm_70 or higher.\n\nExamples\n\nlop3.b32       d, a, b, c, 0x40;\n\nlop3.or.b32  d|p, a, b, c, 0x3f, q;\n\nlop3.and.b32 _|p, a, b, c, 0x3f, q;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3"
            };

        case "mad":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad\" target=\"_blank\" rel=\"noopener noreferrer\">mad(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mad\" target=\"_blank\" rel=\"noopener noreferrer\">mad(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc\" target=\"_blank\" rel=\"noopener noreferrer\">mad.cc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: mad</h1><section id=\"floating-point-instructions-mad\">\n\n\n<p>Multiply two values and add a third value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mad{.ftz}{.sat}.f32      d, a, b, c;    // .target sm_1x\nmad.rnd{.ftz}{.sat}.f32  d, a, b, c;    // .target sm_20\nmad.rnd.f64              d, a, b, c;    // .target sm_13 and higher\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Multiplies two values and adds a third, and then writes the resulting value into a destination\nregister.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a*b + c;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span> <span class=\"pre\">sm_20</span></code> and higher:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">mad.f32</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to infinite precision and then adds <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to\nthis product, again in infinite precision. The resulting value is then rounded to single precision\nusing the rounding mode specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code>.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to infinite precision and then adds <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to\nthis product, again in infinite precision. The resulting value is then rounded to double precision\nusing the rounding mode specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code>.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.{f32,f64}</span></code> is the same as <code class=\"docutils literal notranslate\"><span class=\"pre\">fma.{f32,f64}</span></code>.</p></li>\n</ul>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span> <span class=\"pre\">sm_1x</span></code>:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> at double precision, and then the mantissa is\ntruncated to 23 bits, but the exponent is preserved. 
Note that this is different from computing\nthe product with <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code>, where the mantissa can be rounded and the exponent will be clamped. The\nexception for <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> is when <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span> <span class=\"pre\">=</span> <span class=\"pre\">+/-0.0</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> is identical to the result computed\nusing separate mul and add instructions. When JIT-compiled for SM 2.0 devices, <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> is\nimplemented as a fused multiply-add (i.e., <code class=\"docutils literal notranslate\"><span class=\"pre\">fma.rn.ftz.f32</span></code>). In this case, <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> can\nproduce slightly different numeric results and backward compatibility is not guaranteed in this\ncase.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to infinite precision and then adds <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to\nthis product, again in infinite precision. The resulting value is then rounded to double precision\nusing the rounding mode specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code>. 
Unlike <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code>, the treatment of subnormal\ninputs and output follows IEEE 754 standard.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> is the same as <code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code>.</p></li>\n</ul>\n<p>Rounding modifiers (no default):</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt>\n<dd>\n<p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>Saturation modifier:</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.sat.f32</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>In PTX ISA versions 1.4 and later, a rounding modifier is required for <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code>.</p>\n<p>Legacy <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> instructions having no rounding modifier will map to <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.rn.f64</span></code>.</p>\n<p>In PTX ISA versions 2.0 and later, a rounding modifier is required for <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> and higher targets.</p>\n<p><strong>Errata</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> requires a rounding modifier for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> and higher targets. However for PTX ISA\nversion 3.0 and earlier, ptxas does not enforce this requirement and <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> silently defaults\nto <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.rn.f32</span></code>. 
For PTX ISA version 3.1, ptxas generates a warning and defaults to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mad.rn.f32</span></code>, and in subsequent releases ptxas will enforce the requirement for PTX ISA version\n3.2 and later.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Rounding modifiers have the following target requirements:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p></li>\n</ul>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>@p  mad.f32  
d,a,b,c;\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: mad</h1><section id=\"integer-arithmetic-instructions-mad\">\n\n\n<p>Multiply two values, optionally extract the high or low half of the intermediate result, and add a third value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mad.mode.type  d, a, b, c;\nmad.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo, .wide };\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Multiplies two values, optionally extracts the high or low half of the intermediate result, and adds\na third value. Writes the result into a destination register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>t = a * b;\nn = bitwidth of type;\nd = t + c;           // for .wide\nd = t&lt;2n-1..n&gt; + c;  // for .hi variant\nd = t&lt;n-1..0&gt; + c;   // for .lo variant\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The type of the operation represents the types of the <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> operands. If .hi or .lo is\nspecified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are the same size as <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and either the upper or lower\nhalf of the result is written to the destination register. 
If <code class=\"docutils literal notranslate\"><span class=\"pre\">.wide</span></code> is specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are twice as wide as <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to receive the result of the multiplication.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.wide</span></code> suffix is supported only for 16-bit and 32-bit integer types.</p>\n<p>Saturation modifier:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code></dt>\n<dd>\n<p>limits result to <code class=\"docutils literal notranslate\"><span class=\"pre\">MININT..MAXINT</span></code> (no overflow) for the size of the operation.</p>\n<p>Applies only to <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> type in <code class=\"docutils literal notranslate\"><span class=\"pre\">.hi</span></code> mode.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>@p  mad.lo.s32 d,a,b,c;\n    mad.lo.s32 r,p,q,r;\n</pre></div>\n</div>\n</section>\n<h1>Extended-Precision Arithmetic Instructions: mad.cc</h1><section id=\"extended-precision-arithmetic-instructions-mad-cc\">\n\n\n<p>Multiply two values, extract high or low half of result, and add a third value with carry-out.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mad{.hi,.lo}.cc.type  d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 
};\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Multiplies two values, extracts either the high or low part of the result, and adds a third\nvalue. Writes the result to the destination register and the carry-out from the addition into the\ncondition code register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>t = a * b;\nd = t&lt;63..32&gt; + c;    // for .hi variant\nd = t&lt;31..0&gt; + c;     // for .lo variant\n</pre></div>\n</div>\n<p>carry-out from addition is written to <code class=\"docutils literal notranslate\"><span class=\"pre\">CC.CF</span></code></p>\n<p><strong>Notes</strong></p>\n<p>Generally used in combination with <code class=\"docutils literal notranslate\"><span class=\"pre\">madc</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">addc</span></code> to implement extended-precision multi-word\nmultiplication. See <code class=\"docutils literal notranslate\"><span class=\"pre\">madc</span></code> for an example.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.cc</span></code> introduced in PTX ISA version 3.0.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.cc</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>@p  mad.lo.cc.u32 d,a,b,c;\n    mad.lo.cc.u32 r,p,q,r;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: mad\n\n\n\nMultiply two values and add a third value.\n\nSyntax\n\nmad{.ftz}{.sat}.f32      d, a, b, c;    // .target sm_1x\n\nmad.rnd{.ftz}{.sat}.f32  d, a, b, c;    // .target sm_20\n\nmad.rnd.f64              d, a, b, c;    // .target sm_13 and higher\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nMultiplies two values and adds a third, and then writes the resulting value into a destination\n\nregister.\n\nSemantics\n\nd = a*b + ...\n\n=====Integer Arithmetic Instructions: mad\n\n\n\nMultiply two values, optionally extract the high or low half of the intermediate result, and add a third value.\n\nSyntax\n\nmad.mode.type  d, a, b, c;\n\nmad.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo, .wide };\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64 };\n\nDescription\n\nMultiplies two values, optionally extracts the high or low half of the intermediate result, and adds\n\na third value. Writes the r...\n\n=====Extended-Precision Arithmetic Instructions: mad.cc\n\n\n\nMultiply two values, extract high or low half of result, and add a third value with carry-out.\n\nSyntax\n\nmad{.hi,.lo}.cc.type  d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nMultiplies two values, extracts either the high or low part of the result, and adds a third\n\nvalue. Writes the result to the destination register and the carry-out from the addition into the\n\ncondition code register.\n\nS... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad"
            };

        case "mad24":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mad24\" target=\"_blank\" rel=\"noopener noreferrer\">mad24(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: mad24</h1><section id=\"integer-arithmetic-instructions-mad24\">\n\n\n<p>Multiply two 24-bit integer values and add a third value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mad24.mode.type  d, a, b, c;\nmad24.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo };\n.type = { .u32, .s32 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the product of two 24-bit integer values held in 32-bit source registers, and add a third,\n32-bit value to either the high or low 32-bits of the 48-bit result. Return either the high or low\n32-bits of the 48-bit result.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>t = a * b;\nd = t&lt;47..16&gt; + c;   // for .hi variant\nd = t&lt;31..0&gt; + c;    // for .lo variant\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Integer multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad24.hi</span></code> performs a 24x24-bit multiply and adds the high 32 bits of the 48-bit result to a third\nvalue.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad24.lo</span></code> performs a 24x24-bit multiply and adds the low 32 bits of the 48-bit result to a third\nvalue.</p>\n<p>All operands are of the same type and size.</p>\n<p>Saturation modifier:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span 
class=\"pre\">.sat</span></code></dt>\n<dd>\n<p>limits result of 32-bit signed addition to <code class=\"docutils literal notranslate\"><span class=\"pre\">MININT..MAXINT</span></code> (no overflow). Applies only to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> type in .hi mode.</p>\n</dd>\n</dl>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad24.hi</span></code> may be less efficient on machines without hardware support for 24-bit multiply.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mad24.lo.s32 d,a,b,c;   // low 32-bits of 24x24-bit signed multiply.\n</pre></div>\n</div>\n</section>",
                "tooltip": "Multiply two 24-bit integer values and add a third value.\n\nSyntax\n\nmad24.mode.type  d, a, b, c;\n\nmad24.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo };\n\n.type = { .u32, .s32 };\n\nDescription\n\nCompute the product of two 24-bit integer values held in 32-bit source registers, and add a third,\n\n32-bit value to either the high or low 32-bits of the 48-bit result. Return either the high or low\n\n32-bits of the 48-bit result.\n\nSemantics\n\nt = a * b;\n\nd = t<47..16> + c;   // for .hi variant\n\nd = t<31..0> + c;    // for .lo variant\n\nNotes\n\nInteger multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.\n\nmad24.hi performs a 24x24-bit multiply and adds the high 32 bits of the 48-bit result to a third\n\nvalue.\n\nmad24.lo performs a 24x24-bit multiply and adds the low 32 bits of the 48-bit result to a third\n\nvalue.\n\nAll operands are of the same type and size.\n\nSaturation modifier:\n\n.sat\n\nlimits result of 32-bit signed addition to MININT..MAXINT (no overflow). Applies only to\n\n.s32 type in .hi mode.\n\nmad24.hi may be less efficient on machines without hardware support for 24-bit multiply.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmad24.lo.s32 d,a,b,c;   // low 32-bits of 24x24-bit signed multiply.\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mad24"
            };

        case "madc":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc\" target=\"_blank\" rel=\"noopener noreferrer\">madc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Extended-Precision Arithmetic Instructions: madc</h1><section id=\"extended-precision-arithmetic-instructions-madc\">\n\n\n<p>Multiply two values, extract high or low half of result, and add a third value with carry-in and\noptional carry-out.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>madc{.hi,.lo}{.cc}.type  d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Multiplies two values, extracts either the high or low part of the result, and adds a third value\nalong with carry-in. Writes the result to the destination register and optionally writes the\ncarry-out from the addition into the condition code register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>t = a * b;\nd = t&lt;63..32&gt; + c + CC.CF;     // for .hi variant\nd = t&lt;31..0&gt; + c + CC.CF;      // for .lo variant\n</pre></div>\n</div>\n<p>if <code class=\"docutils literal notranslate\"><span class=\"pre\">.cc</span></code> specified, carry-out from addition is written to <code class=\"docutils literal notranslate\"><span class=\"pre\">CC.CF</span></code></p>\n<p><strong>Notes</strong></p>\n<p>Generally used in combination with <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.cc</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">addc</span></code> to implement extended-precision\nmulti-word multiplication. 
See example below.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">madc</span></code> introduced in PTX ISA version 3.0.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">madc</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// extended-precision multiply:  [r3,r2,r1,r0] = [r5,r4] * [r7,r6]\nmul.lo.u32     r0,r4,r6;      // r0=(r4*r6).[31:0], no carry-out\nmul.hi.u32     r1,r4,r6;      // r1=(r4*r6).[63:32], no carry-out\nmad.lo.cc.u32  r1,r5,r6,r1;   // r1+=(r5*r6).[31:0], may carry-out\nmadc.hi.u32    r2,r5,r6,0;    // r2 =(r5*r6).[63:32]+carry-in,\n                              // no carry-out\nmad.lo.cc.u32   r1,r4,r7,r1;  // r1+=(r4*r7).[31:0], may carry-out\nmadc.hi.cc.u32  r2,r4,r7,r2;  // r2+=(r4*r7).[63:32]+carry-in,\n                              // may carry-out\naddc.u32        r3,0,0;       // r3 = carry-in, no carry-out\nmad.lo.cc.u32   r2,r5,r7,r2;  // r2+=(r5*r7).[31:0], may carry-out\nmadc.hi.u32     r3,r5,r7,r3;  // r3+=(r5*r7).[63:32]+carry-in\n</pre></div>\n</div>\n</section>",
                "tooltip": "Multiply two values, extract high or low half of result, and add a third value with carry-in and\n\noptional carry-out.\n\nSyntax\n\nmadc{.hi,.lo}{.cc}.type  d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nMultiplies two values, extracts either the high or low part of the result, and adds a third value\n\nalong with carry-in. Writes the result to the destination register and optionally writes the\n\ncarry-out from the addition into the condition code register.\n\nSemantics\n\nt = a * b;\n\nd = t<63..32> + c + CC.CF;     // for .hi variant\n\nd = t<31..0> + c + CC.CF;      // for .lo variant\n\nif .cc specified, carry-out from addition is written to CC.CF\n\nNotes\n\nGenerally used in combination with mad.cc and addc to implement extended-precision\n\nmulti-word multiplication. See example below.\n\nPTX ISA Notes\n\n32-bit madc introduced in PTX ISA version 3.0.\n\n64-bit madc introduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\nRequires target sm_20 or higher.\n\nExamples\n\n// extended-precision multiply:  [r3,r2,r1,r0] = [r5,r4] * [r7,r6]\n\nmul.lo.u32     r0,r4,r6;      // r0=(r4*r6).[31:0], no carry-out\n\nmul.hi.u32     r1,r4,r6;      // r1=(r4*r6).[63:32], no carry-out\n\nmad.lo.cc.u32  r1,r5,r6,r1;   // r1+=(r5*r6).[31:0], may carry-out\n\nmadc.hi.u32    r2,r5,r6,0;    // r2 =(r5*r6).[63:32]+carry-in,\n\n                              // no carry-out\n\nmad.lo.cc.u32   r1,r4,r7,r1;  // r1+=(r4*r7).[31:0], may carry-out\n\nmadc.hi.cc.u32  r2,r4,r7,r2;  // r2+=(r4*r7).[63:32]+carry-in,\n\n                              // may carry-out\n\naddc.u32        r3,0,0;       // r3 = carry-in, no carry-out\n\nmad.lo.cc.u32   r2,r5,r7,r2;  // r2+=(r5*r7).[31:0], may carry-out\n\nmadc.hi.u32     r3,r5,r7,r3;  // r3+=(r5*r7).[63:32]+carry-in\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc"
            };

        case "mapa":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa\" target=\"_blank\" rel=\"noopener noreferrer\">mapa <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: mapa</h1><section id=\"data-movement-and-conversion-instructions-mapa\">\n\n\n<p>Map the address of the shared variable in the target CTA.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mapa{.space}.type          d, a, b;\n\n// Maps shared memory address in register a into CTA b.\nmapa.shared::cluster.type  d, a, b;\n\n// Maps shared memory variable into CTA b.\nmapa.shared::cluster.type  d, sh, b;\n\n// Maps shared memory variable into CTA b.\nmapa.shared::cluster.type  d, sh + imm, b;\n\n// Maps generic address in register a into CTA b.\nmapa.type                  d, a, b;\n\n.space = { .shared::cluster }\n.type  = { .u32, .u64 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Get address in the CTA specified by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> which corresponds to the address specified by\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Instruction type <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code> indicates the type of the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and the source\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>When space is <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code>, source <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is either a 
shared memory variable or a register\ncontaining a valid shared memory address and register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> contains a shared memory address. When\nthe optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.space</span></code> is not specified, both <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> are registers containing\ngeneric addresses pointing to shared memory.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is a 32-bit integer operand representing the rank of the target CTA.</p>\n<p>Destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> will hold an address in CTA <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> corresponding to operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mapa.shared::cluster.u64 d1, %reg1, cta;\nmapa.shared::cluster.u32 d2, sh, 3;\nmapa.u64                 d3, %reg2, cta;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Map the address of the shared variable in the target CTA.\n\nSyntax\n\nmapa{.space}.type          d, a, b;\n\n// Maps shared memory address in register a into CTA b.\n\nmapa.shared::cluster.type  d, a, b;\n\n// Maps shared memory variable into CTA b.\n\nmapa.shared::cluster.type  d, sh, b;\n\n// Maps shared memory variable into CTA b.\n\nmapa.shared::cluster.type  d, sh + imm, b;\n\n// Maps generic address in register a into CTA b.\n\nmapa.type                  d, a, b;\n\n.space = { .shared::cluster }\n\n.type  = { .u32, .u64 }\n\nDescription\n\nGet address in the CTA specified by operand b which corresponds to the address specified by\n\noperand a.\n\nInstruction type .type indicates the type of the destination operand d and the source\n\noperand a.\n\nWhen space is .shared::cluster, source a is either a shared memory variable or a register\n\ncontaining a valid shared memory address and register d contains a shared memory address. When\n\nthe optional qualifier .space is not specified, both a and d are registers containing\n\ngeneric addresses pointing to shared memory.\n\nb is a 32-bit integer operand representing the rank of the target CTA.\n\nDestination register d will hold an address in CTA b corresponding to operand a.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nmapa.shared::cluster.u64 d1, %reg1, cta;\n\nmapa.shared::cluster.u32 d2, sh, 3;\n\nmapa.u64                 d3, %reg2, cta;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa"
            };

        case "match":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync\" target=\"_blank\" rel=\"noopener noreferrer\">match.sync <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: match.sync</h1><section id=\"parallel-synchronization-and-communication-instructions-match-sync\">\n\n\n<p>Broadcast and compare a value across threads in warp.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>match.any.sync.type  d, a, membermask;\nmatch.all.sync.type  d[|p], a, membermask;\n\n.type = { .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">match.sync</span></code> will cause executing thread to wait until all non-exited threads from <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>\nhave executed <code class=\"docutils literal notranslate\"><span class=\"pre\">match.sync</span></code> with the same qualifiers and same <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> value before resuming\nexecution.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> specifies a 32-bit integer which is a mask indicating threads participating\nin this instruction where the bit position corresponds to thread\u2019s laneid.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">match.sync</span></code> performs broadcast and compare of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> across all non-exited threads in\n<code class=\"docutils literal notranslate\"><span 
class=\"pre\">membermask</span></code> and sets destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and optional predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> based on mode.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has instruction type and <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is a 32-bit mask where bit position in mask corresponds to thread\u2019s laneid.</p>\n<p>The matching operation modes are:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.all</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is set to mask corresponding to non-exited threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> if all non-exited\nthreads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> have same value of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>; otherwise <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is set\nto 0. Optionally predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to true if all non-exited threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> have\nsame value of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>; otherwise <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to false. 
The sink symbol \u2018_\u2019 may be used in\nplace of any one of the destination operands.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.any</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is set to mask of non-exited threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> that have same value of operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n</dd>\n</dl>\n<p>The behavior of <code class=\"docutils literal notranslate\"><span class=\"pre\">match.sync</span></code> is undefined if the executing thread is not in the <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Release Notes</strong></p>\n<p>Note that <code class=\"docutils literal notranslate\"><span class=\"pre\">match.sync</span></code> applies to threads in a single warp, not across an entire CTA.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>match.any.sync.b32    d, a, 0xffffffff;\nmatch.all.sync.b64    d|p, a, mask;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Broadcast and compare a value across threads in warp.\n\nSyntax\n\nmatch.any.sync.type  d, a, membermask;\n\nmatch.all.sync.type  d[|p], a, membermask;\n\n.type = { .b32, .b64 };\n\nDescription\n\nmatch.sync will cause executing thread to wait until all non-exited threads from membermask\n\nhave executed match.sync with the same qualifiers and same membermask value before resuming\n\nexecution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin this instruction where the bit position corresponds to thread\u2019s laneid.\n\nmatch.sync performs broadcast and compare of operand a across all non-exited threads in\n\nmembermask and sets destination d and optional predicate p based on mode.\n\nOperand a has instruction type and d has .b32 type.\n\nDestination d is a 32-bit mask where bit position in mask corresponds to thread\u2019s laneid.\n\nThe matching operation modes are:\n\n.all\n\nd is set to mask corresponding to non-exited threads in membermask if all non-exited\n\nthreads in membermask have same value of operand a; otherwise d is set\n\nto 0. Optionally predicate p is set to true if all non-exited threads in membermask have\n\nsame value of operand a; otherwise p is set to false. The sink symbol \u2018_\u2019 may be used in\n\nplace of any one of the destination operands.\n\n.any\n\nd is set to mask of non-exited threads in membermask that have same value of operand\n\na.\n\nThe behavior of match.sync is undefined if the executing thread is not in the membermask.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nRequires sm_70 or higher.\n\nRelease Notes\n\nNote that match.sync applies to threads in a single warp, not across an entire CTA.\n\nExamples\n\nmatch.any.sync.b32    d, a, 0xffffffff;\n\nmatch.all.sync.b64    d|p, a, mask;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync"
            };

        case "max":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max\" target=\"_blank\" rel=\"noopener noreferrer\">max(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max\" target=\"_blank\" rel=\"noopener noreferrer\">max(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max\" target=\"_blank\" rel=\"noopener noreferrer\">max(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: max</h1><section id=\"floating-point-instructions-max\">\n\n\n<p>Find the maximum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>max{.ftz}{.NaN}{.xorsign.abs}.f32  d, a, b;\nmax.f64                            d, a, b;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the maximum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.NaN</span></code> modifier is specified, the result is canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> if either of the inputs is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.abs</span></code> modifier is specified, the magnitude of destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is the maximum of\nabsolute values of both the input arguments.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> modifier is specified, the sign bit of destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is equal to the XOR of the\nsign bits of both the inputs.</p>\n<p>Modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> must be specified together and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> considers the sign\nbit of both inputs before applying <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> operation.</p>\n<p>If the result of <code class=\"docutils literal notranslate\"><span class=\"pre\">max</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> then the <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifiers will be ignored.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (.xorsign) {\n    xorsign = getSignBit(a) ^ getSignBit(b);\n    if (.abs) {\n        a = |a|;\n        b = |b|;\n    }\n}\nif (isNaN(a) &amp;&amp; isNaN(b))                 d = NaN;\nelse if (.NaN &amp;&amp; (isNaN(a) || isNaN(b)))  d = NaN;\nelse if (isNaN(a))                        d = b;\nelse if (isNaN(b))                        d = a;\nelse                                      d = (a &gt; b) ? 
a : b;\nif (.xorsign &amp;&amp; !isNaN(d)) {\n    setSignBit(d, xorsign);\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>If values of both inputs are 0.0, then +0.0 &gt; -0.0.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.NaN</span></code>introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.xorsign.abs</span></code> introduced in PTX ISA version 7.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.NaN</span></code>requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.xorsign.abs</span></code> requires <code class=\"docutils 
literal notranslate\"><span class=\"pre\">sm_86</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>max.ftz.f32  f0,f1,f2;\nmax.f64      a,b,c;\n// fp32 max with .NaN\nmax.NaN.f32  f0,f1,f2;\n// fp32 max with .xorsign.abs\nmax.xorsign.abs.f32 Rd, Ra, Rb;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: max</h1><section id=\"half-precision-floating-point-instructions-max\">\n\n\n<p>Find the maximum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>max{.ftz}{.NaN}{.xorsign.abs}.f16      d, a, b;\nmax{.ftz}{.NaN}{.xorsign.abs}.f16x2    d, a, b;\nmax{.NaN}{.xorsign.abs}.bf16           d, a, b;\nmax{.NaN}{.xorsign.abs}.bf16x2         d, a, b;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the maximum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction types, input vectors are formed with half-word values\nfrom source operands. 
Half-word operands are then processed in parallel to store <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> result in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction\ntype, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.NaN</span></code> modifier is specified, the result is canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> if either of the inputs is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifier is specified, the magnitude of destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is the maximum of\nabsolute values of both the input arguments.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> modifier is specified, the sign bit of destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is equal to the XOR of the\nsign bits of both the inputs.</p>\n<p>Modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> must be specified together and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> considers the sign\nbit of both inputs before applying <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> operation.</p>\n<p>If the result of <code class=\"docutils literal notranslate\"><span class=\"pre\">max</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> then the <code class=\"docutils 
literal notranslate\"><span class=\"pre\">.xorsign</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifiers will be ignored.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    if (.xorsign) {\n        xorsign = getSignBit(a) ^ getSignBit(b);\n        if (.abs) {\n            a = |a|;\n            b = |b|;\n        }\n    }\n    if (isNaN(a) &amp;&amp; isNaN(b))              d = NaN;\n    if (.NaN &amp;&amp; (isNaN(a) || isNaN(b)))    d = NaN;\n    else if (isNaN(a))                     d = b;\n    else if (isNaN(b))                     d = a;\n    else                                   d = (a &gt; b) ? a : b;\n    if (.xorsign &amp;&amp; !isNaN(d)) {\n         setSignBit(d, xorsign);\n    }\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n        if (.xorsign) {\n            xorsign = getSignBit(fA[i]) ^ getSignBit(fB[i]);\n            if (.abs) {\n                fA[i] = |fA[i]|;\n                fB[i] = |fB[i]|;\n            }\n        }\n        if (isNaN(fA[i]) &amp;&amp; isNaN(fB[i]))              d[i] = NaN;\n        if (.NaN &amp;&amp; (isNaN(fA[i]) || isNaN(fB[i])))    d[i] = NaN;\n        else if (isNaN(fA[i]))                         d[i] = fB[i];\n        else if (isNaN(fB[i]))                         d[i] = fA[i];\n        else                                           d[i] = (fA[i] &gt; fB[i]) ? 
fA[i] : fB[i];\n        if (.xorsign &amp;&amp; !isNaN(fA[i])) {\n            setSignBit(d[i], xorsign);\n        }\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt>\n<dd>\n<p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">max.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>If values of both inputs are 0.0, then +0.0 &gt; -0.0.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.xorsign.abs</span></code> introduced in PTX ISA version 7.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.xorsign.abs</span></code> support requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_86</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>max.ftz.f16       h0,h1,h2;\nmax.f16x2         b0,b1,b2;\n// SIMD fp16 max with NaN\nmax.NaN.f16x2     b0,b1,b2;\n// scalar f16 max with xorsign.abs\nmax.xorsign.abs.f16 Rd, Ra, Rb;\nmax.bf16          h0, h1, h2;\n// scalar bf16 max and NaN\nmax.NaN.bf16x2    b0, b1, b2;\n// SIMD bf16 max with xorsign.abs\nmax.xorsign.abs.bf16x2 Rd, Ra, Rb;\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: max</h1><section id=\"integer-arithmetic-instructions-max\">\n\n\n<p>Find the maximum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>max.atype         d, a, b;\nmax{.relu}.btype  d, a, b;\n\n.atype = { .u16, .u32, .u64,\n  
         .u16x2, .s16, .s64 };\n.btype = { .s16x2, .s32 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the maximum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> instruction types, forms input vectors by half word values from source\noperands. Half-word operands are then processed in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> result\nin destination.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have the same type as the instruction type. 
For instruction types\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code>, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (type == u16x2 || type == s16x2) {\n    iA[0] = a[0:15];\n    iA[1] = a[16:31];\n    iB[0] = b[0:15];\n    iB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = (iA[i] &gt; iB[i]) ? iA[i] : iB[i];\n    }\n} else {\n    d = (a &gt; b) ? a : b; // Integer (signed and unsigned)\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Signed and unsigned differ.</p>\n<dl class=\"simple\">\n<dt>Saturation modifier:</dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.relu.{s16x2,</span> <span class=\"pre\">s32}</span></code> clamps the result to 0 if negative.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">max{.relu}.s16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">max.relu.s32</span></code> introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">max{.relu}.s16x2</span></code> and 
<code class=\"docutils literal notranslate\"><span class=\"pre\">max.relu.s32</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>max.u32  d,a,b;\nmax.s32  q,q,0;\nmax.relu.s16x2 t,t,u;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: max\n\n\n\nFind the maximum of two values.\n\nSyntax\n\nmax{.ftz}{.NaN}{.xorsign.abs}.f32  d, a, b;\n\nmax.f64                            d, a, b;\n\nDescription\n\nStore the maximum of a and b in d.\n\nIf .NaN modifier is specified, the result is canonical NaN if either of the inputs is\n\nNaN.\n\nIf .abs modifier is specified, the magnitude of destination operand d is the maximum of\n\nabsolute values of both the input arguments.\n\nIf...\n\n=====Half Precision Floating Point Instructions: max\n\n\n\nFind the maximum of two values.\n\nSyntax\n\nmax{.ftz}{.NaN}{.xorsign.abs}.f16      d, a, b;\n\nmax{.ftz}{.NaN}{.xorsign.abs}.f16x2    d, a, b;\n\nmax{.NaN}{.xorsign.abs}.bf16           d, a, b;\n\nmax{.NaN}{.xorsign.abs}.bf16x2         d, a, b;\n\nDescription\n\nStore the maximum of a and b in d.\n\nFor .f16x2 and .bf16x2 instruction types, input vectors are formed with half-word values\n\nfrom source operands. Half-word o...\n\n=====Integer Arithmetic Instructions: max\n\n\n\nFind the maximum of two values.\n\nSyntax\n\nmax.atype         d, a, b;\n\nmax{.relu}.btype  d, a, b;\n\n.atype = { .u16, .u32, .u64,\n\n           .u16x2, .s16, .s64 };\n\n.btype = { .s16x2, .s32 };\n\nDescription\n\nStore the maximum of a and b in d.\n\nFor .u16x2, .s16x2 instruction types, forms input vectors by half word values from source\n\noperands. Half-word operands are then processed in parallel to produce .u16x2, .s... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max"
            };

        // PTX `.maxclusterrank` directive (Cluster Dimension Directives): declares the
        // maximum number of CTAs that can be part of a cluster. The lookup key is the
        // directive name without its leading dot.
        //   html    - "For more information" link followed by the full HTML of the doc section.
        //   tooltip - tag-stripped text of the same section, ending in a " ..." marker.
        //   url     - anchor of the section in the NVIDIA PTX documentation.
        // NOTE(review): `..maxclusterrank` (double dot) in the example looks like a typo for
        // `.maxclusterrank`; presumably carried over from the upstream page - confirm there
        // before changing this string.
        case "maxclusterrank":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank\" target=\"_blank\" rel=\"noopener noreferrer\">maxclusterrank <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Cluster Dimension Directives: .maxclusterrank</h1><section id=\"cluster-dimension-directives-maxclusterrank\">\n\n\n<p>Declare the maximum number of CTAs that can be part of the cluster.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.maxclusterrank n\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declare the maximum number of thread blocks (CTAs) allowed to be part of the cluster.</p>\n<p><strong>Semantics</strong></p>\n<p>Product of the number of CTAs in each cluster dimension specified in any invocation of the kernel is\nrequired to be less or equal to that specified in this directive. Otherwise invocation will result\nin a runtime error or kernel launch failure.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxclusterrank</span></code> directive cannot be used in conjunction with the <code class=\"docutils literal notranslate\"><span class=\"pre\">.reqnctapercluster</span></code> directive.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.entry foo ..maxclusterrank 8         { . . . }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare the maximum number of CTAs that can be part of the cluster.\n\nSyntax\n\n.maxclusterrank n\n\nDescription\n\nDeclare the maximum number of thread blocks (CTAs) allowed to be part of the cluster.\n\nSemantics\n\nProduct of the number of CTAs in each cluster dimension specified in any invocation of the kernel is\n\nrequired to be less or equal to that specified in this directive. Otherwise invocation will result\n\nin a runtime error or kernel launch failure.\n\nThe .maxclusterrank directive cannot be used in conjunction with the .reqnctapercluster directive.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.entry foo ..maxclusterrank 8         { . . . }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank"
            };

        // PTX `.maxnreg` directive (Performance-Tuning Directives): maximum number of
        // registers that can be allocated per thread. The lookup key is the directive
        // name without its leading dot.
        //   html    - "For more information" link followed by the full HTML of the doc section.
        //   tooltip - tag-stripped text of the same section, ending in a " ..." marker.
        //   url     - anchor of the section in the NVIDIA PTX documentation.
        case "maxnreg":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-maxnreg\" target=\"_blank\" rel=\"noopener noreferrer\">maxnreg <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Performance-Tuning Directives: .maxnreg</h1><section id=\"performance-tuning-directives-maxnreg\">\n\n\n<p>Maximum number of registers that can be allocated per thread.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.maxnreg n\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declare the maximum number of registers per thread in a CTA.</p>\n<p><strong>Semantics</strong></p>\n<p>The compiler guarantees that this limit will not be exceeded. The actual number of registers used\nmay be less; for example, the backend may be able to compile to fewer registers, or the maximum\nnumber of registers may be further constrained by <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxntid</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxctapersm</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.entry foo .maxnreg 16 { ... }  // max regs per thread = 16\n</pre></div>\n</div>\n</section>",
                "tooltip": "Maximum number of registers that can be allocated per thread.\n\nSyntax\n\n.maxnreg n\n\nDescription\n\nDeclare the maximum number of registers per thread in a CTA.\n\nSemantics\n\nThe compiler guarantees that this limit will not be exceeded. The actual number of registers used\n\nmay be less; for example, the backend may be able to compile to fewer registers, or the maximum\n\nnumber of registers may be further constrained by .maxntid and .maxctapersm.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .maxnreg 16 { ... }  // max regs per thread = 16\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-maxnreg"
            };

        // PTX `.maxntid` directive (Performance-Tuning Directives): maximum number of
        // threads in the thread block (CTA), given as one, two or three dimension extents.
        // The lookup key is the directive name without its leading dot.
        //   html    - "For more information" link followed by the full HTML of the doc section.
        //   tooltip - tag-stripped text of the same section, ending in a " ..." marker.
        //   url     - anchor of the section in the NVIDIA PTX documentation.
        // The `\u00a0` escapes in both strings are non-breaking spaces that are part of the text.
        case "maxntid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-maxntid\" target=\"_blank\" rel=\"noopener noreferrer\">maxntid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Performance-Tuning Directives: .maxntid</h1><section id=\"performance-tuning-directives-maxntid\">\n\n\n<p>Maximum number of threads in the thread block (CTA).</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.maxntid nx\n.maxntid nx, ny\n.maxntid nx, ny, nz\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declare the maximum number of threads in the thread block (CTA). This maximum is specified by giving\nthe maximum extent of each dimension of the 1D, 2D, or 3D CTA.\u00a0 The maximum number of threads is the\nproduct of the maximum extent in each dimension.</p>\n<p><strong>Semantics</strong></p>\n<p>The maximum number of threads in the thread block, computed as the product of the maximum extent\nspecified for each dimension, is guaranteed not to be exceeded in any invocation of the kernel in\nwhich this directive appears. Exceeding the maximum number of threads results in a runtime error or\nkernel launch failure.</p>\n<p>Note that this directive guarantees that the <em>total</em> number of threads does not exceed the maximum,\nbut does not guarantee that the limit in any particular dimension is not exceeded.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.entry foo .maxntid 256       { ... }  // max threads = 256\n.entry bar .maxntid 16,16,4   { ... 
}  // max threads = 1024\n</pre></div>\n</div>\n</section>",
                "tooltip": "Maximum number of threads in the thread block (CTA).\n\nSyntax\n\n.maxntid nx\n\n.maxntid nx, ny\n\n.maxntid nx, ny, nz\n\nDescription\n\nDeclare the maximum number of threads in the thread block (CTA). This maximum is specified by giving\n\nthe maximum extent of each dimension of the 1D, 2D, or 3D CTA.\u00a0 The maximum number of threads is the\n\nproduct of the maximum extent in each dimension.\n\nSemantics\n\nThe maximum number of threads in the thread block, computed as the product of the maximum extent\n\nspecified for each dimension, is guaranteed not to be exceeded in any invocation of the kernel in\n\nwhich this directive appears. Exceeding the maximum number of threads results in a runtime error or\n\nkernel launch failure.\n\nNote that this directive guarantees that the total number of threads does not exceed the maximum,\n\nbut does not guarantee that the limit in any particular dimension is not exceeded.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .maxntid 256       { ... }  // max threads = 256\n\n.entry bar .maxntid 16,16,4   { ... }  // max threads = 1024\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-maxntid"
            };

        case "mbarrier":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.arrive <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.arrive_drop <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.complete_tx <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.expect_tx <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.init <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new 
window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.inval <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.pending_count <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.test_wait/mbarrier.try_wait <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: mbarrier</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier\">\n\n\n<ul class=\"simple\">\n<li><p>Synchronizing any subset of threads within a CTA</p></li>\n<li><p>One-way synchronization of threads across CTAs of a cluster. 
As noted in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-smem\">mbarrier support with\nshared memory</a>, threads can\nperform only <em>arrive</em> operations but not <em>*_wait</em> on an mbarrier located in <code class=\"docutils literal notranslate\"><span class=\"pre\">shared::cluster</span></code>\nspace.</p></li>\n<li><p>Waiting for completion of asynchronous memory operations initiated by a thread and making them\nvisible to other threads.</p></li>\n</ul>\n<p>An <em>mbarrier object</em> is an opaque object in memory which can be initialized and invalidated using :</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.inval</span></code></p></li>\n</ul>\n<p>Operations supported on <em>mbarrier object</em>s are :</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.expect_tx</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.pending_count</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.mbarrier.arrive</span></code></p></li>\n</ul>\n<p>Performing any <em>mbarrier</em> operation except <code class=\"docutils 
literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> on an uninitialized <em>mbarrier object</em>\nresults in undefined behavior.</p>\n<p>Unlike <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions which can access a limited number of barriers\nper CTA, <em>mbarrier objects</em> are used defined and are only limited by the total shared memory size\navailable.</p>\n<p><em>mbarrier</em> operations enable threads to perform useful work after the arrival at the <em>mbarrier</em> and\nbefore waiting for the <em>mbarrier</em> to complete.</p>\n<section id=\"size-and-alignment-of-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\"></span><h5>\n<span class=\"section-number\">9.7.14.15.1. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size and alignment of mbarrier object</a><a class=\"headerlink\" href=\"#size-and-alignment-of-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p>An mbarrier object is an opaque object with the following type and alignment requirements :</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 23%\"/>\n<col style=\"width: 44%\"/>\n<col style=\"width: 33%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Type</p></th>\n<th class=\"head\"><p>Alignment (bytes)</p></th>\n<th class=\"head\"><p>Memory space</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n<td><p>8</p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code></p></td>\n</tr>\n</tbody>\n</table>\n</section>\n<section 
id=\"contents-of-the-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-contents\"></span><h5>\n<span class=\"section-number\">9.7.14.15.2. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier object</a><a class=\"headerlink\" href=\"#contents-of-the-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p>An opaque <em>mbarrier object</em> keeps track of the following information :</p>\n<ul class=\"simple\">\n<li><p>Current phase of the <em>mbarrier object</em></p></li>\n<li><p>Count of pending arrivals for the current phase of the <em>mbarrier object</em></p></li>\n<li><p>Count of expected arrivals for the next phase of the <em>mbarrier object</em></p></li>\n<li><p>Count of pending asynchronous memory operations (or transactions) tracked by the current phase of\nthe <em>mbarrier object</em>. This is also referred to as <em>tx-count</em>.</p></li>\n</ul>\n<p>An <em>mbarrier object</em> progresses through a sequence of phases where each phase is defined by threads\nperforming an expected number of <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperations.</p>\n<p>The valid range of each of the counts is as shown below:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 38%\"/>\n<col style=\"width: 33%\"/>\n<col style=\"width: 30%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>Count name</p></th>\n<th class=\"head\"><p>Minimum value</p></th>\n<th class=\"head\"><p>Maximum value</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p>Expected arrival count</p></td>\n<td><p>1</p></td>\n<td><p>2<sup>20</sup> - 1</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>Pending arrival count</p></td>\n<td><p>0</p></td>\n<td><p>2<sup>20</sup> - 
1</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p>tx-count</p></td>\n<td><p>-(2<sup>20</sup> - 1)</p></td>\n<td><p>2<sup>20</sup> - 1</p></td>\n</tr>\n</tbody>\n</table>\n</section>\n<section id=\"lifecycle-of-the-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-lifecycle\"></span><h5>\n<span class=\"section-number\">9.7.14.15.3. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-lifecycle\">Lifecycle of the mbarrier object</a><a class=\"headerlink\" href=\"#lifecycle-of-the-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p>The <em>mbarrier object</em> must be initialized prior to use.</p>\n<p>An <em>mbarrier object</em> is used to synchronize threads and asynchronous memory operations.</p>\n<p>An <em>mbarrier object</em> may be used to perform a sequence of such synchronizations.</p>\n<p>An <em>mbarrier object</em> must be invalidated to repurpose its memory.</p>\n</section>\n<section id=\"phase-of-the-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-phase\"></span><h5>\n<span class=\"section-number\">9.7.14.15.4. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-phase\">Phase of the mbarrier object</a><a class=\"headerlink\" href=\"#phase-of-the-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p>The phase of an <em>mbarrier object</em> is the number of times the <em>mbarrier object</em> has been used to\nsynchronize threads and <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a>\noperations. 
In each phase {0, 1, 2, \u2026}, threads perform in program order :</p>\n<ul class=\"simple\">\n<li><p><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperations to complete the current phase and</p></li>\n<li><p><em>test_wait</em> / <em>try_wait</em> operations to check for the completion of the current phase.</p></li>\n</ul>\n<p>An <em>mbarrier object</em> is automatically reinitialized upon completion of the current phase for\nimmediate use in the next phase. The current phase is incomplete and all prior phases are complete.</p>\n<p>For each phase of the mbarrier object, at least one <em>test_wait</em> or <em>try_wait</em> operation must be\nperformed which returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">waitComplete</span></code> before an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a> operation\nin the subsequent phase.</p>\n</section>\n<section id=\"tracking-asynchronous-operations-by-the-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-tracking-async-operations\"></span><h5>\n<span class=\"section-number\">9.7.14.15.5. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-tracking-async-operations\">Tracking asynchronous operations by the mbarrier object</a><a class=\"headerlink\" href=\"#tracking-asynchronous-operations-by-the-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p>Starting with the Hopper architecture (<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_9x</span></code>), <em>mbarrier object</em> supports a new count, called\n<em>tx-count</em>, which is used for tracking the completion of asynchronous memory operations or\ntransactions. <em>tx-count</em> tracks the number of asynchronous transactions, in units specified by the\nasynchronous memory operation, that are outstanding and yet to be complete.</p>\n<p>The <em>tx-count</em> of an <em>mbarrier object</em> must be set to the total amount of asynchronous memory\noperations, in units as specified by the asynchronous operations, to be tracked by the current\nphase. Upon completion of each of the asynchronous operations, the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation will be performed on the <em>mbarrier object</em> and thus progress the mbarrier towards the\ncompletion of the current phase.</p>\n<section id=\"expect-tx-operation\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\"></span><h6>\n<span class=\"section-number\">9.7.14.15.5.1. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx operation</a><a class=\"headerlink\" href=\"#expect-tx-operation\" title=\"Permalink to this headline\">\uf0c1</a>\n</h6>\n<p>The <em>expect-tx</em> operation, with an <code class=\"docutils literal notranslate\"><span class=\"pre\">expectCount</span></code> argument, increases the <em>tx-count</em> of an\n<em>mbarrier object</em> by the value specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">expectCount</span></code>. This makes the current phase of the\n<em>mbarrier object</em> to expect and track the completion of additional asynchronous transactions.</p>\n</section>\n<section id=\"complete-tx-operation\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\"></span><h6>\n<span class=\"section-number\">9.7.14.15.5.2. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx operation</a><a class=\"headerlink\" href=\"#complete-tx-operation\" title=\"Permalink to this headline\">\uf0c1</a>\n</h6>\n<p>The <em>complete-tx</em> operation, with an <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument, on an <em>mbarrier object</em> consists of the following:</p>\n<dl class=\"simple\">\n<dt>mbarrier signaling</dt>\n<dd>\n<p>Signals the completion of asynchronous transactions that were tracked by the current phase. As a\nresult of this, <em>tx-count</em> is decremented by <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code>.</p>\n</dd>\n<dt>mbarrier potentially completing the current phase</dt>\n<dd>\n<p>If the current phase has been completed then the mbarrier transitions to the next phase. 
Refer to\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-phase-completion\">Phase Completion of the mbarrier object</a>\nfor details on phase completion requirements and phase transition process.</p>\n</dd>\n</dl>\n</section>\n</section>\n<section id=\"phase-completion-of-the-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-phase-completion\"></span><h5>\n<span class=\"section-number\">9.7.14.15.6. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-phase-completion\">Phase Completion of the mbarrier object</a><a class=\"headerlink\" href=\"#phase-completion-of-the-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p>The requirements for completion of the current phase are described below. Upon completion of the\ncurrent phase, the phase transitions to the subsequent phase as described below.</p>\n<dl class=\"simple\">\n<dt>Current phase completion requirements</dt>\n<dd>\n<p>An <em>mbarrier object</em> completes the current phase when all of the following conditions are met:</p>\n<ul class=\"simple\">\n<li><p>The count of the pending arrivals has reached zero.</p></li>\n<li><p>The <em>tx-count</em> has reached zero.</p></li>\n</ul>\n</dd>\n<dt>Phase transition</dt>\n<dd>\n<p>When an <em>mbarrier</em> object completes the current phase, the following actions are performed\natomically:</p>\n<ul class=\"simple\">\n<li><p>The <em>mbarrier object</em> transitions to the next phase.</p></li>\n<li><p>The pending arrival count is reinitialized to the expected arrival count.</p></li>\n</ul>\n</dd>\n</dl>\n</section>\n<section id=\"arrive-on-operation-on-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\"></span><h5>\n<span class=\"section-number\">9.7.14.15.7. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">Arrive-on operation on mbarrier object</a><a class=\"headerlink\" href=\"#arrive-on-operation-on-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p>An <em>arrive-on</em> operation, with an optional <em>count</em> argument, on an <em>mbarrier object</em> consists of the\nfollowing 2 steps :</p>\n<ul>\n<li>\n<p>mbarrier signalling:</p>\n<p>Signals the arrival of the executing thread OR completion of the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> instruction which\nsignals the arrive-on operation initiated by the executing thread on the <em>mbarrier object</em>. As a\nresult of this, the pending arrival count is decremented by <em>count</em>. If the <em>count</em> argument is\nnot specified, then it defaults to 1.</p>\n</li>\n<li>\n<p>mbarrier potentially completing the current phase:</p>\n<p>If the current phase has been completed then the mbarrier transitions to the next phase. Refer to\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-phase-completion\">Phase Completion of the mbarrier object</a>\nfor details on phase completion requirements and phase transition process.</p>\n</li>\n</ul>\n</section>\n<section id=\"mbarrier-support-with-shared-memory\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-smem\"></span><h5>\n<span class=\"section-number\">9.7.14.15.8. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-smem\">mbarrier support with shared memory</a><a class=\"headerlink\" href=\"#mbarrier-support-with-shared-memory\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p>The following table summarizes the support of various mbarrier operations on <em>mbarrier objects</em>\nlocated at different shared memory locations:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 35%\"/>\n<col style=\"width: 23%\"/>\n<col style=\"width: 42%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>mbarrier operations</p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code></p></td>\n<td><p>Supported</p></td>\n<td><p>Supported, cannot return result</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.expect_tx</span></code></p></td>\n<td><p>Supported</p></td>\n<td><p>Supported</p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code></p></td>\n<td><p>Supported</p></td>\n<td><p>Supported</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p>Other mbarrier operations</p></td>\n<td><p>Supported</p></td>\n<td><p>Not supported</p></td>\n</tr>\n</tbody>\n</table>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-init\">\n<h5>\n<span class=\"section-number\">9.7.14.15.9. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-init\">Parallel Synchronization and Communication Instructions: mbarrier.init</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-init\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p><strong>mbarrier.init</strong></p>\n<p>Initialize the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.init{.shared{::cta}}.b64 [addr], count;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> initializes the <em>mbarrier object</em> at the location specified by the address operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> with the unsigned 32-bit integer <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>. The value of operand count must be in the range\nas specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier object</a>.</p>\n<p>Initialization of the <em>mbarrier object</em> involves :</p>\n<ul class=\"simple\">\n<li><p>Initializing the current phase to 0.</p></li>\n<li><p>Initializing the expected arrival count to <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>.</p></li>\n<li><p>Initializing the pending arrival count to <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>.</p></li>\n<li><p>Initializing the <em>tx-count</em> to 0.</p></li>\n</ul>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. 
If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.shared .b64 shMem, shMem2;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n\ncvta.shared.u64          addr, shMem2;\nmbarrier.init.b64        [addr],   %r1;\nbar.cta.sync             0;\n// ... other mbarrier operations on addr\n\nmbarrier.init.shared::cta.b64 [shMem], 12;\nbar.sync                 0;\n// ... other mbarrier operations on shMem\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-inval\">\n<h5>\n<span class=\"section-number\">9.7.14.15.10. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-inval\">Parallel Synchronization and Communication Instructions: mbarrier.inval</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-inval\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p><strong>mbarrier.inval</strong></p>\n<p>Invalidates the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.inval{.shared{::cta}}.b64 [addr];\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.inval</span></code> invalidates the <em>mbarrier object</em> at the location specified by the address\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>.</p>\n<p>An <em>mbarrier object</em> must be invalidated before using its memory location for any other purpose.</p>\n<p>Performing any <em>mbarrier</em> operation except <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> on an invalidated mbarrier object\nresults in undefined behaviour.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.shared .b64 shmem;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n.reg    .pred t0;\n\n// Example 1 :\nbar.sync                      0;\n@t0 mbarrier.init.b64     [addr], %r1;\n// ... other mbarrier operations on addr\nbar.sync                      0;\n@t0 mbarrier.inval.b64    [addr];\n\n\n// Example 2 :\nbar.cta.sync                  0;\nmbarrier.init.shared.b64           [shmem], 12;\n// ... other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared.b64      [shmem];\n\n// shmem can be reused here for unrelated use :\nbar.cta.sync                  0;\nst.shared.b64                      [shmem], ...;\n\n// shmem can be re-initialized as mbarrier object :\nbar.cta.sync                  0;\n@t0 mbarrier.init.shared.b64       [shmem], 24;\n// ... other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared::cta.b64 [shmem];\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-expect-tx\">\n<h5>\n<span class=\"section-number\">9.7.14.15.11. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx\">Parallel Synchronization and Communication Instructions: mbarrier.expect_tx</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p><strong>mbarrier.expect_tx</strong></p>\n<p>Perfoms <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a> operation on the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.expect_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.expect_tx</span></code> performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation on the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. The\n32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <code class=\"docutils literal notranslate\"><span class=\"pre\">expectCount</span></code> argument to the\n<em>expect-tx</em> operation.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. 
If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> are as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>This operation does not provide any memory ordering semantics and thus is a <em>relaxed</em> operation.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.expect_tx.b64                       [addr], 32;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj1], 512;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj2], 512;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-complete-tx\">\n<h5>\n<span class=\"section-number\">9.7.14.15.12. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx\">Parallel Synchronization and Communication Instructions: mbarrier.complete_tx</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p><strong>mbarrier.complete_tx</strong></p>\n<p>Perfoms <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.complete_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code> performs a <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. 
The\n32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument to the\n<em>complete-tx</em> operation.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code> does not involve any asynchronous memory operations and only simulates the\ncompletion of an asynchronous memory operation and its side effect of signaling to the <em>mbarrier\nobject</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> are as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>This operation does not provide any memory ordering semantics and thus is a <em>relaxed</em> operation.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.complete_tx.b64             [addr],     32;\nmbarrier.complete_tx.shared.b64      [mbarObj1], 512;\nmbarrier.complete_tx.relaxed.cta.b64 [addr2],    32;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-arrive\">\n<h5>\n<span class=\"section-number\">9.7.14.15.13. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive\">Parallel Synchronization and Communication Instructions: mbarrier.arrive</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p><strong>mbarrier.arrive</strong></p>\n<p>Performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> on the\n<em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64           state, [addr]{, count};\nmbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64         _, [addr] {,count}\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64   _, [addr], txCount;\nmbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64  state, [addr], count;\n\n.sem   = { .release }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a> operation\non the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. 
The 32-bit\nunsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> specifies the <em>count</em> argument to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>The optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> specifies that an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation is performed prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation. The 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <em>expectCount</em> argument to\nthe <em>expect-tx</em> operation. 
When both qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.arrive</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> are specified, then\nthe count argument of the <em>arrive-on</em> operation is assumed to be 1.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation with <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> qualifier must not cause the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code> to\ncomplete its current phase, otherwise the behavior is undefined.</p>\n<p>The value of the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> must be in the range as specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier\nobject</a>.</p>\n<p>Note: for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, when the argument <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is specified, the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> is\nrequired.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation on an <em>mbarrier object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> returns an opaque\n64-bit register capturing the phase of the <em>mbarrier object</em> prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> in the\ndestination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state.</span></code> Contents of the <code class=\"docutils literal 
notranslate\"><span class=\"pre\">state</span></code> operand are implementation\nspecific. Optionally, sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> can be used for the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> argument.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation on an <em>mbarrier object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> but not in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> cannot return a value. Sink symbol \u2018_\u2019 is mandatory for the destination operand for\nsuch cases.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier specifies a memory synchronizing effect as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory\nConsistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that directly observe the memory\nsynchronizing effect of this operation, as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified then it\ndefaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state space where the\nmbarrier resides.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sink symbol \u2018_\u2019 as the destination operand is introduced in PTX ISA version 7.1.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> introduced in PTX ISA version\n7.8.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> introduced in PTX ISA version 8.0.</p>\n<p>Support for qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> is introduced in PTX ISA version 8.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.expect_tx</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 cnt, remoteAddr32, remoteCTAId, addr32;\n.reg .b64 %r&lt;3&gt;, addr, remoteAddr64;\n.shared .b64 shMem, shMem2;\n\ncvta.shared.u64            addr, shMem2;\nmov.b32                    addr32, shMem2;\nmapa.shared::cluster.u32   remoteAddr32, addr32, remoteCTAId;\nmapa.u64                   remoteAddr64, addr,   remoteCTAId;\n\ncvta.shared.u64          addr, shMem2;\n\nmbarrier.arrive.shared.b64                       %r0, [shMem];\nmbarrier.arrive.shared::cta.b64                  %r0, [shMem2];\nmbarrier.arrive.release.cta.shared::cluster.b64  _, [remoteAddr32];\nmbarrier.arrive.release.cluster.b64              _, [remoteAddr64], cnt;\nmbarrier.arrive.expect_tx.release.cluster.b64    _, [remoteAddr64], tx_count;\nmbarrier.arrive.noComplete.b64                   %r1, [addr], 2;\nmbarrier.arrive.b64                              %r2, [addr], cnt;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop\">\n<h5>\n<span class=\"section-number\">9.7.14.15.14. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop\">Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p><strong>mbarrier.arrive_drop</strong></p>\n<p>Decrements the expected count of the <em>mbarrier object</em> and performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.arrive_drop{.sem}{.scope}{.shared{::cta}}.b64 state,           [addr]{, count};\nmbarrier.arrive_drop{.sem}{.scope}{.shared::cluster}.b64           _,   [addr] {,count};\nmbarrier.arrive_drop.expect_tx{.shared{::cta}}{.sem}{.scope}.b64 state, [addr], tx_count;\nmbarrier.arrive_drop.expect_tx{.shared::cluster}{.sem}{.scope}.b64   _, [addr], tx_count;\nmbarrier.arrive_drop.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state,  [addr], count;\n\n.sem   = { .release }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> on the <em>mbarrier object</em> at the location specified by\nthe address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> performs the following steps:</p>\n<ul class=\"simple\">\n<li><p>Decrements the expected arrival count of the <em>mbarrier object</em> by the value specified by the\n32-bit integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>. 
If <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> operand is not specified, it defaults to 1.</p></li>\n<li><p>Performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> on the\n<em>mbarrier object</em>. The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> specifies the <em>count</em> argument to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on\noperation</a>.</p></li>\n</ul>\n<p>The decrement done in the expected arrivals count of the <em>mbarrier object</em> will be for all the\nsubsequent phases of the <em>mbarrier object</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>The optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> specifies that an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation is performed prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation. The 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <em>expectCount</em> argument to\nthe <em>expect-tx</em> operation. When both qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.arrive</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> are specified, then\nthe count argument of the <em>arrive-on</em> operation is assumed to be 1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation forms the <em>release</em> pattern as described in the Memory\nConsistency Model and synchronizes with the <em>acquire</em> patterns.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that an <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code>\ninstruction can directly synchronize. 
If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified then it defaults\nto <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state space where the mbarrier\nresides.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> qualifier must not complete the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier,</span></code>\notherwise the behavior is undefined.</p>\n<p>The value of the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> must be in the range as specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier\nobject</a>.</p>\n<p>Note: for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, when the argument <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is specified, the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> is\nrequired.</p>\n<p>A thread that wants to either exit or opt out of participating in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> can use\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> to drop itself from the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation on an <em>mbarrier 
object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> returns an\nopaque 64-bit register capturing the phase of the <em>mbarrier object</em> prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on\noperation</a>\nin the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code>. Contents of the returned state are implementation\nspecific. Optionally, sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> can be used for the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> argument.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation on an <em>mbarrier</em> object located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> but not\nin <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> cannot return a value. 
Sink symbol \u2018_\u2019 is mandatory for the destination operand\nfor such cases.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> introduced in PTX ISA version\n7.8.</p>\n<p>Support for qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> is introduced in PTX ISA version 8.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> introduced in PTX ISA version 8.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span 
class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 cnt;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\n// Example 1\n@p mbarrier.arrive_drop.shared.b64 _, [shMem];\n@p exit;\n@p2 mbarrier.arrive_drop.noComplete.shared.b64 _, [shMem], %a;\n@p2 exit;\n..\n@!p mbarrier.arrive.shared.b64   %r1, [shMem];\n@!p mbarrier.test_wait.shared.b64  q, [shMem], %r1;\n\n// Example 2\nmbarrier.arrive_drop.shared::cluster.b64 _, [addr];\nmbarrier.arrive_drop.shared::cta.release.cluster.b64     _, [addr], cnt;\n\n// Example 3\nmbarrier.arrive_drop.expect_tx.shared::cta.release.cta.b64 state, [addr], tx_count;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive\">\n<h5>\n<span class=\"section-number\">9.7.14.15.15. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive\">Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p><strong>cp.async.mbarrier.arrive</strong></p>\n<p>Makes the <em>mbarrier object</em> track all prior <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations initiated by the\nexecuting thread.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>cp.async.mbarrier.arrive{.noinc}{.shared{::cta}}.b64 [addr];\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Causes an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> to be\ntriggered by the system on the <em>mbarrier object</em> upon the completion of all prior <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations initiated by the\nexecuting thread. The <em>mbarrier object</em> is at the location specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. 
The\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> is\nasynchronous to execution of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.mbarrier.arrive</span></code>.</p>\n<p>When <code class=\"docutils literal notranslate\"><span class=\"pre\">.noinc</span></code> modifier is not specified, the pending count of the mbarrier object is incremented\nby 1 prior to the asynchronous <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a>. This\nresults in a zero-net change for the pending count from the asynchronous <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a> operation\nduring the current phase. The pending count of the <em>mbarrier object</em> after the increment should not\nexceed the limit as mentioned in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier object</a>. Otherwise,\nthe behavior is undefined.</p>\n<p>When the <code class=\"docutils literal notranslate\"><span class=\"pre\">.noinc</span></code> modifier is specified, the increment to the pending count of the <em>mbarrier\nobject</em> is not performed. Hence the decrement of the pending count done by the asynchronous\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> must be\naccounted for in the initialization of the <em>mbarrier object</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. 
If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Example 1: no .noinc\nmbarrier.init.shared.b64 [shMem], threadCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n....\n// Absence of .noinc accounts for arrive-on from completion of prior cp.async operations.\n// So mbarrier.init must only account for arrive-on from mbarrier.arrive.\ncp.async.mbarrier.arrive.shared.b64 [shMem];\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n\n\n\n// Example 2: with .noinc\n\n// 
Tracks arrive-on from mbarrier.arrive and cp.async.mbarrier.arrive.\n\n// All threads participating in the mbarrier perform cp.async\nmov.b32 copyOperationCnt, threadCount;\n\n// 3 arrive-on operations will be triggered per-thread\nmul.lo.u32 copyArrivalCnt, copyOperationCnt, 3;\n\nadd.u32 totalCount, threadCount, copyArrivalCnt;\n\nmbarrier.init.shared.b64 [shMem], totalCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n...\n// Presence of .noinc requires mbarrier initalization to have accounted for arrive-on from cp.async\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 1st instance\n....\ncp.async.ca.shared.global [shard3], [gbl3], 4;\ncp.async.ca.shared.global [shard4], [gbl4], 16;\ncp.async.mbarrier.arrive.noinc.shared::cta.b64 [shMem]; // 2nd instance\n....\ncp.async.ca.shared.global [shard5], [gbl5], 4;\ncp.async.cg.shared.global [shard6], [gbl6], 16;\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 3rd and last instance\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait\"></span><h5>\n<span class=\"section-number\">9.7.14.15.16. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait\">Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p><strong>mbarrier.test_wait/mbarrier.try_wait</strong></p>\n<p>Checks whether the <em>mbarrier object</em> has completed the phase.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.test_wait{.sem}{.scope}{.shared{::cta}}.b64        waitComplete, [addr], state;\nmbarrier.test_wait.parity{.sem}{.scope}{.shared{::cta}}.b64 waitComplete, [addr], phaseParity;\n\nmbarrier.try_wait{.sem}{.scope}{.shared{::cta}}.b64         waitComplete, [addr], state\n                                                               {, suspendTimeHint};\n\nmbarrier.try_wait.parity{.sem}{.scope}{.shared{::cta}}.b64  waitComplete, [addr], phaseParity\n                                                               {, suspendTimeHint};\n\n.sem   = { .acquire }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <em>test_wait</em> and <em>try_wait</em> operations test for the completion of the current or the immediately\npreceding phase of an <em>mbarrier object</em> at the location specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> is a non-blocking instruction which tests for the completion of the phase.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> is a potentially blocking instruction which tests for the completion of the\nphase. 
If the phase is not complete, the executing thread may be suspended. Suspended thread resumes\nexecution when the specified phase completes OR before the phase completes following a\nsystem-dependent time limit. The optional 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">suspendTimeHint</span></code>\nspecifies the time limit, in nanoseconds, that may be used for the time limit instead of the\nsystem-dependent limit.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> test for completion of the phase :</p>\n<ul class=\"simple\">\n<li><p>Specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code>, which was returned by an <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> instruction on\nthe same <em>mbarrier object</em> during the current or the immediately preceding phase. Or</p></li>\n<li><p>Indicated by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code>, which is the integer parity of either the current phase\nor the immediately preceding phase of the <em>mbarrier object</em>.</p></li>\n</ul>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> variant of the instructions test for the completion of the phase indicated by the\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code>, which is the integer parity of either the current phase or the immediately\npreceding phase of the <em>mbarrier object</em>. An even phase has integer parity 0 and an odd phase has\ninteger parity of 1. 
So the valid values of <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code> operand are 0 and 1.</p>\n<p>Note: the use of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> variants of the instructions requires tracking the phase of an\n<em>mbarrier object</em> throughout its lifetime.</p>\n<p>The <em>test_wait</em> and <em>try_wait</em> operations are valid only for :</p>\n<ul class=\"simple\">\n<li><p>the current incomplete phase, for which <code class=\"docutils literal notranslate\"><span class=\"pre\">waitComplete</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p></li>\n<li><p>the immediately preceding phase, for which <code class=\"docutils literal notranslate\"><span class=\"pre\">waitComplete</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>.</p></li>\n</ul>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>When <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> operations return <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>, they form the\n<em>acquire</em> pattern as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> instructions can directly synchronize. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not\nspecified then it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state\nspace where the mbarrier resides.</p>\n<p>The following ordering of memory operations hold for the executing thread when\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> :</p>\n<ol class=\"arabic simple\">\n<li><p>All memory accesses (except <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">async operations</a> ) requested prior, in program\norder, to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> during the completed phase by the participating threads of the CTA\nare performed and are visible to the executing thread.</p></li>\n<li><p>All <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations\nrequested prior, in program order, to <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.mbarrier.arrive</span></code> during the completed phase by\nthe participating threads of the CTA are performed and made visible to the executing thread.</p></li>\n<li><p>All <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> asynchronous operations using the same <em>mbarrier object</em> requested prior,\nin program order, to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> during the completed phase by the participating threads\nof the CTA are performed and made visible to the executing thread.</p></li>\n<li><p>All memory accesses requested after the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> or 
<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code>, in\nprogram order, are not performed and not visible to memory accesses performed prior to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code>, in program order, by other threads participating in the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code>.</p></li>\n<li><p>There is no ordering and visibility guarantee for memory accesses requested by the thread after\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> and prior to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code>, in program order.</p></li>\n</ol>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> introduced in PTX ISA version 7.0.</p>\n<p>Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> is introduced in PTX ISA version 7.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code 
class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Example 1a, thread synchronization with test_wait:\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.test_wait.shared.b64    complete, [shMem], %r1;\n@!complete nanosleep.u32 20;\n@!complete bra waitLoop;\n\n// Example 1b, thread synchronization with try_wait :\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.try_wait.shared.b64    complete, [shMem], %r1;\n@!complete bra waitLoop;\n\n\n// Example 2, thread synchronization using phase parity :\n\n.reg .b32 i, parArg;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmov.b32 i, 0;\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nloopStart :                           // One phase per loop iteration\n    ...\n    mbarrier.arrive.shared.b64  %r1, [shMem]; // N threads\n    ...\n    and.b32 parArg, i, 1;\n    waitLoop:\n    mbarrier.test_wait.parity.shared.b64  complete, [shMem], parArg;\n    @!complete nanosleep.u32 20;\n    @!complete bra waitLoop;\n    ...\n    add.u32 i, i, 
1;\n    setp.lt.u32 p, i, IterMax;\n@p bra loopStart;\n\n\n// Example 3, Asynchronous copy completion waiting :\n\n.reg .b64 state;\n.shared .b64 shMem2;\n.shared .b64 shard1, shard2;\n.global .b64 gbl1, gbl2;\n\nmbarrier.init.shared.b64 [shMem2], threadCount;\n...\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n\n// Absence of .noinc accounts for arrive-on from prior cp.async operation\ncp.async.mbarrier.arrive.shared.b64 [shMem2];\n...\nmbarrier.arrive.shared.b64 state, [shMem2];\n\nwaitLoop:\nmbarrier.test_wait.shared::cta.b64 p, [shMem2], state;\n@!p bra waitLoop;\n\n// Example 4, Synchronizing the CTA0 threads with cluster threads\n.reg .b64 %r1, addr, remAddr;\n.shared .b64 shMem;\n\ncvta.shared.u64          addr, shMem;\nmapa.u64                 remAddr, addr, 0;     // CTA0\u2019s shMem instance\n\n// One thread from CTA0 executing the below initialization operation\n@p0 mbarrier.init.shared::cta.b64 [shMem], N;  // N = no of cluster threads\n\nbarrier.cluster.arrive;\nbarrier.cluster.wait;\n\n// Entire cluster executing the below arrive operation\nmbarrier.arrive.release.cluster.b64              _, [remAddr];\n\n// computation not requiring mbarrier synchronization ...\n\n// Only CTA0 threads executing the below wait operation\nwaitLoop:\nmbarrier.try_wait.parity.acquire.cluster.shared::cta.b64  complete, [shMem], 0;\n@!complete bra waitLoop;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-pending-count\">\n<h5>\n<span class=\"section-number\">9.7.14.15.17. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-pending-count\">Parallel Synchronization and Communication Instructions: mbarrier.pending_count</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-pending-count\" title=\"Permalink to this headline\">\uf0c1</a>\n</h5>\n<p><strong>mbarrier.pending_count</strong></p>\n<p>Query the pending arrival count from the opaque mbarrier state.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.pending_count.b64 count, state;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The pending count can be queried from the opaque mbarrier state using <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.pending_count</span></code>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> operand is a 64-bit register that must be the result of a prior\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive.noComplete</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop.noComplete</span></code> instruction. 
Otherwise, the\nbehavior is undefined.</p>\n<p>The destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is a 32-bit unsigned integer representing the pending count of\nthe <em>mbarrier object</em> prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> from\nwhich the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> register was obtained.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 %r1;\n.reg .b64 state;\n.shared .b64 shMem;\n\nmbarrier.arrive.noComplete.b64 state, [shMem], 1;\nmbarrier.pending_count.b64 %r1, state;\n</pre></div>\n</div>\n</section>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.arrive</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-arrive\">\n\n\n<p>Performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> on the\n<em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64           state, [addr]{, count};\nmbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64         _, [addr] {,count}\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64   _, [addr], txCount;\nmbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64  state, [addr], count;\n\n.sem   = 
{ .release }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a> operation\non the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. The 32-bit\nunsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> specifies the <em>count</em> argument to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>The optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> specifies that an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation is performed prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation. The 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <em>expectCount</em> argument to\nthe <em>expect-tx</em> operation. When both qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.arrive</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> are specified, then\nthe count argument of the <em>arrive-on</em> operation is assumed to be 1.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation with <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> qualifier must not cause the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code> to\ncomplete its current phase, otherwise the behavior is undefined.</p>\n<p>The value of the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> must be in the range as specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier\nobject</a>.</p>\n<p>Note: for <code 
class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, when the argument <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is specified, the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> is\nrequired.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation on an <em>mbarrier object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> returns an opaque\n64-bit register capturing the phase of the <em>mbarrier object</em> prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> in the\ndestination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state.</span></code> Contents of the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> operand are implementation\nspecific. Optionally, sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> can be used for the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> argument.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation on an <em>mbarrier object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> but not in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> cannot return a value. 
Sink symbol \u2018_\u2019 is mandatory for the destination operand for\nsuch cases.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier specifies a memory synchronizing effect as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory\nConsistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that directly observe the memory\nsynchronizing effect of this operation, as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified then it\ndefaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state space where the\nmbarrier resides.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sink symbol \u2018_\u2019 as the destination operand is introduced in PTX ISA version 7.1.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> introduced in PTX ISA version\n7.8.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> introduced in PTX ISA version 8.0.</p>\n<p>Support for qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> is introduced in PTX ISA version 8.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.expect_tx</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 cnt, remoteAddr32, remoteCTAId, addr32;\n.reg .b64 %r&lt;3&gt;, addr, remoteAddr64;\n.shared .b64 shMem, shMem2;\n\ncvta.shared.u64            addr, shMem2;\nmov.b32                    addr32, shMem2;\nmapa.shared::cluster.u32   remoteAddr32, addr32, remoteCTAId;\nmapa.u64                   remoteAddr64, addr,   remoteCTAId;\n\ncvta.shared.u64          addr, shMem2;\n\nmbarrier.arrive.shared.b64                       %r0, [shMem];\nmbarrier.arrive.shared::cta.b64                  %r0, [shMem2];\nmbarrier.arrive.release.cta.shared::cluster.b64  _, [remoteAddr32];\nmbarrier.arrive.release.cluster.b64              _, [remoteAddr64], cnt;\nmbarrier.arrive.expect_tx.release.cluster.b64    _, [remoteAddr64], tx_count;\nmbarrier.arrive.noComplete.b64                   %r1, [addr], 2;\nmbarrier.arrive.b64                              %r2, [addr], cnt;\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop\">\n\n\n<p>Decrements the expected count of the <em>mbarrier object</em> and performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on 
operation</a>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.arrive_drop{.sem}{.scope}{.shared{::cta}}.b64 state,           [addr]{, count};\nmbarrier.arrive_drop{.sem}{.scope}{.shared::cluster}.b64           _,   [addr] {,count};\nmbarrier.arrive_drop.expect_tx{.shared{::cta}}{.sem}{.scope}.b64 state, [addr], tx_count;\nmbarrier.arrive_drop.expect_tx{.shared::cluster}{.sem}{.scope}.b64   _, [addr], tx_count;\nmbarrier.arrive_drop.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state,  [addr], count;\n\n.sem   = { .release }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> on the <em>mbarrier object</em> at the location specified by\nthe address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> performs the following steps:</p>\n<ul class=\"simple\">\n<li><p>Decrements the expected arrival count of the <em>mbarrier object</em> by the value specified by the\n32-bit integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>. If <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> operand is not specified, it defaults to 1.</p></li>\n<li><p>Performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> on the\n<em>mbarrier object</em>. 
The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> specifies the <em>count</em> argument to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on\noperation</a>.</p></li>\n</ul>\n<p>The decrement done in the expected arrivals count of the <em>mbarrier object</em> will be for all the\nsubsequent phases of the <em>mbarrier object</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>The optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> specifies that an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation is performed prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation. 
The 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <em>expectCount</em> argument to\nthe <em>expect-tx</em> operation. When both qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.arrive</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> are specified, then\nthe count argument of the <em>arrive-on</em> operation is assumed to be 1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation forms the <em>release</em> pattern as described in the Memory\nConsistency Model and synchronizes with the <em>acquire</em> patterns.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that an <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code>\ninstruction can directly synchronize. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified then it defaults\nto <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state space where the mbarrier\nresides.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> qualifier must not complete the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier,</span></code>\notherwise the behavior is undefined.</p>\n<p>The value of the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> must be in the range as specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier\nobject</a>.</p>\n<p>Note: for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, when the argument <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is specified, the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> is\nrequired.</p>\n<p>A thread that wants to either exit or opt out of participating in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> can use\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> to drop itself from the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation on an <em>mbarrier object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> returns an\nopaque 64-bit register capturing the phase of the <em>mbarrier object</em> prior to the <a 
class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on\noperation</a>\nin the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code>. Contents of the returned state are implementation\nspecific. Optionally, sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> can be used for the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> argument.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation on an <em>mbarrier</em> object located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> but not\nin <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> cannot return a value. Sink symbol \u2018_\u2019 is mandatory for the destination operand\nfor such cases.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> introduced in PTX ISA version\n7.8.</p>\n<p>Support for qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> is introduced in PTX ISA version 8.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> introduced in PTX ISA version 8.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 cnt;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\n// Example 1\n@p mbarrier.arrive_drop.shared.b64 _, [shMem];\n@p exit;\n@p2 mbarrier.arrive_drop.noComplete.shared.b64 _, [shMem], %a;\n@p2 exit;\n..\n@!p mbarrier.arrive.shared.b64   %r1, [shMem];\n@!p mbarrier.test_wait.shared.b64  q, [shMem], %r1;\n\n// Example 2\nmbarrier.arrive_drop.shared::cluster.b64 _, [addr];\nmbarrier.arrive_drop.shared::cta.release.cluster.b64     _, [addr], cnt;\n\n// Example 3\nmbarrier.arrive_drop.expect_tx.shared::cta.release.cta.b64 state, [addr], 
tx_count;\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.complete_tx</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-complete-tx\">\n\n\n<p>Performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.complete_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code> performs a <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. The\n32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument to the\n<em>complete-tx</em> operation.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code> does not involve any asynchronous memory operations and only simulates the\ncompletion of an asynchronous memory operation and its side effect of signaling to the <em>mbarrier\nobject</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. 
If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> are as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>This operation does not provide any memory ordering semantics and thus is a <em>relaxed</em> operation.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.complete_tx.b64             [addr],     32;\nmbarrier.complete_tx.shared.b64      [mbarObj1], 512;\nmbarrier.complete_tx.relaxed.cta.b64 [addr2],    32;\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.expect_tx</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-expect-tx\">\n\n\n<p>Perfoms <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a> operation on the <em>mbarrier 
object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.expect_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.expect_tx</span></code> performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation on the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. The\n32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <code class=\"docutils literal notranslate\"><span class=\"pre\">expectCount</span></code> argument to the\n<em>expect-tx</em> operation.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> are as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>This operation does not provide any memory ordering semantics and thus is a <em>relaxed</em> operation.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.expect_tx.b64                       [addr], 32;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj1], 512;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj2], 512;\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.init</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-init\">\n\n\n<p>Initialize the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.init{.shared{::cta}}.b64 [addr], count;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> initializes the <em>mbarrier object</em> at the location specified by the address operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> with the unsigned 32-bit integer <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>. 
The value of operand count must be in the range\nas specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier object</a>.</p>\n<p>Initialization of the <em>mbarrier object</em> involves :</p>\n<ul class=\"simple\">\n<li><p>Initializing the current phase to 0.</p></li>\n<li><p>Initializing the expected arrival count to <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>.</p></li>\n<li><p>Initializing the pending arrival count to <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>.</p></li>\n<li><p>Initializing the <em>tx-count</em> to 0.</p></li>\n</ul>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.shared .b64 shMem, shMem2;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n\ncvta.shared.u64          addr, shMem2;\nmbarrier.init.b64        [addr],   %r1;\nbar.cta.sync             0;\n// ... other mbarrier operations on addr\n\nmbarrier.init.shared::cta.b64 [shMem], 12;\nbar.sync                 0;\n// ... 
other mbarrier operations on shMem\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.inval</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-inval\">\n\n\n<p>Invalidates the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.inval{.shared{::cta}}.b64 [addr];\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.inval</span></code> invalidates the <em>mbarrier object</em> at the location specified by the address\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>.</p>\n<p>An <em>mbarrier object</em> must be invalidated before using its memory location for any other purpose.</p>\n<p>Performing any <em>mbarrier</em> operation except <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> on an invalidated mbarrier object\nresults in undefined behaviour.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.shared .b64 shmem;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n.reg    .pred t0;\n\n// Example 1 :\nbar.sync                      0;\n@t0 mbarrier.init.b64     [addr], %r1;\n// ... other mbarrier operations on addr\nbar.sync                      0;\n@t0 mbarrier.inval.b64    [addr];\n\n\n// Example 2 :\nbar.cta.sync                  0;\nmbarrier.init.shared.b64           [shmem], 12;\n// ... other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared.b64      [shmem];\n\n// shmem can be reused here for unrelated use :\nbar.cta.sync                  0;\nst.shared.b64                      [shmem], ...;\n\n// shmem can be re-initialized as mbarrier object :\nbar.cta.sync                  0;\n@t0 mbarrier.init.shared.b64       [shmem], 24;\n// ... 
other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared::cta.b64 [shmem];\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.pending_count</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-pending-count\">\n\n\n<p>Query the pending arrival count from the opaque mbarrier state.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.pending_count.b64 count, state;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The pending count can be queried from the opaque mbarrier state using <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.pending_count</span></code>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> operand is a 64-bit register that must be the result of a prior\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive.noComplete</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop.noComplete</span></code> instruction. 
Otherwise, the\nbehavior is undefined.</p>\n<p>The destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is a 32-bit unsigned integer representing the pending count of\nthe <em>mbarrier object</em> prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> from\nwhich the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> register was obtained.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 %r1;\n.reg .b64 state;\n.shared .b64 shMem;\n\nmbarrier.arrive.noComplete.b64 state, [shMem], 1;\nmbarrier.pending_count.b64 %r1, state;\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait\"></span>\n\n<p>Checks whether the <em>mbarrier object</em> has completed the phase.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mbarrier.test_wait{.sem}{.scope}{.shared{::cta}}.b64        waitComplete, [addr], state;\nmbarrier.test_wait.parity{.sem}{.scope}{.shared{::cta}}.b64 waitComplete, [addr], phaseParity;\n\nmbarrier.try_wait{.sem}{.scope}{.shared{::cta}}.b64         waitComplete, [addr], state\n                                                               {, 
suspendTimeHint};\n\nmbarrier.try_wait.parity{.sem}{.scope}{.shared{::cta}}.b64  waitComplete, [addr], phaseParity\n                                                               {, suspendTimeHint};\n\n.sem   = { .acquire }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <em>test_wait</em> and <em>try_wait</em> operations test for the completion of the current or the immediately\npreceding phase of an <em>mbarrier object</em> at the location specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> is a non-blocking instruction which tests for the completion of the phase.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> is a potentially blocking instruction which tests for the completion of the\nphase. If the phase is not complete, the executing thread may be suspended. Suspended thread resumes\nexecution when the specified phase completes OR before the phase completes following a\nsystem-dependent time limit. 
The optional 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">suspendTimeHint</span></code>\nspecifies the time limit, in nanoseconds, that may be used for the time limit instead of the\nsystem-dependent limit.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> test for completion of the phase :</p>\n<ul class=\"simple\">\n<li><p>Specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code>, which was returned by an <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> instruction on\nthe same <em>mbarrier object</em> during the current or the immediately preceding phase. Or</p></li>\n<li><p>Indicated by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code>, which is the integer parity of either the current phase\nor the immediately preceding phase of the <em>mbarrier object</em>.</p></li>\n</ul>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> variant of the instructions test for the completion of the phase indicated by the\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code>, which is the integer parity of either the current phase or the immediately\npreceding phase of the <em>mbarrier object</em>. An even phase has integer parity 0 and an odd phase has\ninteger parity of 1. 
So the valid values of <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code> operand are 0 and 1.</p>\n<p>Note: the use of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> variants of the instructions requires tracking the phase of an\n<em>mbarrier object</em> throughout its lifetime.</p>\n<p>The <em>test_wait</em> and <em>try_wait</em> operations are valid only for :</p>\n<ul class=\"simple\">\n<li><p>the current incomplete phase, for which <code class=\"docutils literal notranslate\"><span class=\"pre\">waitComplete</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p></li>\n<li><p>the immediately preceding phase, for which <code class=\"docutils literal notranslate\"><span class=\"pre\">waitComplete</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>.</p></li>\n</ul>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>When <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> operations return <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>, they form the\n<em>acquire</em> pattern as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> instructions can directly synchronize. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not\nspecified then it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state\nspace where the mbarrier resides.</p>\n<p>The following ordering of memory operations hold for the executing thread when\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> :</p>\n<ol class=\"arabic simple\">\n<li><p>All memory accesses (except <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">async operations</a> ) requested prior, in program\norder, to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> during the completed phase by the participating threads of the CTA\nare performed and are visible to the executing thread.</p></li>\n<li><p>All <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations\nrequested prior, in program order, to <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.mbarrier.arrive</span></code> during the completed phase by\nthe participating threads of the CTA are performed and made visible to the executing thread.</p></li>\n<li><p>All <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> asynchronous operations using the same <em>mbarrier object</em> requested prior,\nin program order, to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> during the completed phase by the participating threads\nof the CTA are performed and made visible to the executing thread.</p></li>\n<li><p>All memory accesses requested after the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> or 
<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code>, in\nprogram order, are not performed and not visible to memory accesses performed prior to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code>, in program order, by other threads participating in the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code>.</p></li>\n<li><p>There is no ordering and visibility guarantee for memory accesses requested by the thread after\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> and prior to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code>, in program order.</p></li>\n</ol>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> introduced in PTX ISA version 7.0.</p>\n<p>Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> is introduced in PTX ISA version 7.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code 
class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Example 1a, thread synchronization with test_wait:\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.test_wait.shared.b64    complete, [shMem], %r1;\n@!complete nanosleep.u32 20;\n@!complete bra waitLoop;\n\n// Example 1b, thread synchronization with try_wait :\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.try_wait.shared.b64    complete, [shMem], %r1;\n@!complete bra waitLoop;\n\n\n// Example 2, thread synchronization using phase parity :\n\n.reg .b32 i, parArg;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmov.b32 i, 0;\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nloopStart :                           // One phase per loop iteration\n    ...\n    mbarrier.arrive.shared.b64  %r1, [shMem]; // N threads\n    ...\n    and.b32 parArg, i, 1;\n    waitLoop:\n    mbarrier.test_wait.parity.shared.b64  complete, [shMem], parArg;\n    @!complete nanosleep.u32 20;\n    @!complete bra waitLoop;\n    ...\n    add.u32 i, i, 
1;\n    setp.lt.u32 p, i, IterMax;\n@p bra loopStart;\n\n\n// Example 3, Asynchronous copy completion waiting :\n\n.reg .b64 state;\n.shared .b64 shMem2;\n.shared .b64 shard1, shard2;\n.global .b64 gbl1, gbl2;\n\nmbarrier.init.shared.b64 [shMem2], threadCount;\n...\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n\n// Absence of .noinc accounts for arrive-on from prior cp.async operation\ncp.async.mbarrier.arrive.shared.b64 [shMem2];\n...\nmbarrier.arrive.shared.b64 state, [shMem2];\n\nwaitLoop:\nmbarrier.test_wait.shared::cta.b64 p, [shMem2], state;\n@!p bra waitLoop;\n\n// Example 4, Synchronizing the CTA0 threads with cluster threads\n.reg .b64 %r1, addr, remAddr;\n.shared .b64 shMem;\n\ncvta.shared.u64          addr, shMem;\nmapa.u64                 remAddr, addr, 0;     // CTA0\u2019s shMem instance\n\n// One thread from CTA0 executing the below initialization operation\n@p0 mbarrier.init.shared::cta.b64 [shMem], N;  // N = no of cluster threads\n\nbarrier.cluster.arrive;\nbarrier.cluster.wait;\n\n// Entire cluster executing the below arrive operation\nmbarrier.arrive.release.cluster.b64              _, [remAddr];\n\n// computation not requiring mbarrier synchronization ...\n\n// Only CTA0 threads executing the below wait operation\nwaitLoop:\nmbarrier.try_wait.parity.acquire.cluser.shared::cta.b64  complete, [shMem], 0;\n@!complete bra waitLoop;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Parallel Synchronization and Communication Instructions: mbarrier\n\n\n\nSynchronizing any subset of threads within a CTA\n\nOne-way synchronization of threads across CTAs of a cluster. As noted in mbarrier support with\n\nshared memory, threads can\n\nperform only arrive operations but not *_wait on an mbarrier located in shared::cluster\n\nspace.\n\nWaiting for completion of asynchronous memory operations initiated by a thread and making them\n\nvisible to other threads.\n\nAn mbarrier o...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.arrive\n\n\n\nPerforms arrive-on operation on the\n\nmbarrier object.\n\nSyntax\n\nmbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64           state, [addr]{, count};\n\nmbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64         _, [addr] {,count}\n\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;\n\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64   _, [addr], txCount;\n\nmbarrier....\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop\n\n\n\nDecrements the expected count of the mbarrier object and performs arrive-on operation.\n\nSyntax\n\nmbarrier.arrive_drop{.sem}{.scope}{.shared{::cta}}.b64 state,           [addr]{, count};\n\nmbarrier.arrive_drop{.sem}{.scope}{.shared::cluster}.b64           _,   [addr] {,count};\n\nmbarrier.arrive_drop.expect_tx{.shared{::cta}}{.sem}{.scope}.b64 state, [addr], tx_count;\n\nmbarrier.arrive_drop.expect_tx{.shared...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.complete_tx\n\n\n\nPerfoms complete-tx\n\noperation on the mbarrier object.\n\nSyntax\n\nmbarrier.complete_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n\n.scope = { .cta, .cluster }\n\n.space = { .shared{::cta}, .shared::cluster }\n\nDescription\n\nA thread executing mbarrier.complete_tx performs a complete-tx\n\noperation on the mbarrier 
object at the location specified by the address operand addr. The\n\n32-bit unsig...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.expect_tx\n\n\n\nPerfoms expect-tx operation on the mbarrier object.\n\nSyntax\n\nmbarrier.expect_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n\n.scope = { .cta, .cluster }\n\n.space = { .shared{::cta}, .shared::cluster }\n\nDescription\n\nA thread executing mbarrier.expect_tx performs an expect-tx\n\noperation on the mbarrier object at the location specified by the address operand addr. The\n\n32-bit unsigned int...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.init\n\n\n\nInitialize the mbarrier object.\n\nSyntax\n\nmbarrier.init{.shared{::cta}}.b64 [addr], count;\n\nDescription\n\nmbarrier.init initializes the mbarrier object at the location specified by the address operand\n\naddr with the unsigned 32-bit integer count. The value of operand count must be in the range\n\nas specified in Contents of the mbarrier object.\n\nInitialization of the mbarrier object involves :\n\nInitializing t...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.inval\n\n\n\nInvalidates the mbarrier object.\n\nSyntax\n\nmbarrier.inval{.shared{::cta}}.b64 [addr];\n\nDescription\n\nmbarrier.inval invalidates the mbarrier object at the location specified by the address\n\noperand addr.\n\nAn mbarrier object must be invalidated before using its memory location for any other purpose.\n\nPerforming any mbarrier operation except mbarrier.init on an invalidated mbarrier object\n\nresults in undefine...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.pending_count\n\n\n\nQuery the pending arrival count from the opaque mbarrier state.\n\nSyntax\n\nmbarrier.pending_count.b64 count, state;\n\nDescription\n\nThe pending count can be queried from the opaque mbarrier state using mbarrier.pending_count.\n\nThe state operand is a 64-bit 
register that must be the result of a prior\n\nmbarrier.arrive.noComplete or mbarrier.arrive_drop.noComplete instruction. Otherwise, the\n\nbehavior is undefi...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait\n\n\n\nChecks whether the mbarrie ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier"
            };

        // Documentation entry returned for the key "membar/fence" ("Enforce an ordering of
        // memory operations"). The returned object carries three strings:
        //   - html:    the rendered documentation section, prefixed with a "For more information" link;
        //   - tooltip: a plain-text rendering of the same section, cut short and ending in " ...";
        //   - url:     the anchor of this section in the NVIDIA PTX documentation.
        // NOTE(review): the case label is the combined heading "membar/fence" and matches only
        // that exact string; a bare "membar" or "fence" opcode reaches this entry only if it is
        // mapped to this key somewhere outside this clause — confirm against the caller.
        case "membar/fence":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence\" target=\"_blank\" rel=\"noopener noreferrer\">membar/fence <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: membar/fence</h1><section id=\"parallel-synchronization-and-communication-instructions-membar-fence\">\n<span id=\"parallel-synchronization-and-communication-instructions-membar\"></span>\n\n<p>Enforce an ordering of memory operations.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Thread fence :\nfence{.sem}.scope;\n\n// Operation fence :\nfence.op_restrict.release.cluster;\n\n// Proxy fence (bi-directional) :\nfence.proxy.proxykind;\n\n// Proxy fence (uni-directional) :\nfence.proxy.to_proxykind::from_proxykind.release.scope;\nfence.proxy.to_proxykind::from_proxykind.acquire.scope  [addr], size;\n\n// Old style membar :\nmembar.level;\nmembar.proxy.proxykind;\n\n.sem       = { .sc, .acq_rel };\n.scope     = { .cta, .cluster, .gpu, .sys };\n.level     = { .cta, .gl, .sys };\n.proxykind = { .alias, .async, async.global, .async.shared::{cta, cluster} };\n.op_restrict = { .mbarrier_init };\n.to_proxykind::from_proxykind = {.tensormap::generic};\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code> instruction guarantees that prior memory accesses requested by this thread (<code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> and <code class=\"docutils literal 
notranslate\"><span class=\"pre\">red</span></code> instructions) are performed at the specified <code class=\"docutils literal notranslate\"><span class=\"pre\">level</span></code>, before later\nmemory operations requested by this thread following the <code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code> instruction. The <code class=\"docutils literal notranslate\"><span class=\"pre\">level</span></code>\nqualifier specifies the set of threads that may observe the ordering effect of this operation.</p>\n<p>A memory read (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value read has been\ntransmitted from memory and cannot be modified by another thread at the indicated level. A memory\nwrite (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value written has become\nvisible to other threads at the specified level, that is, when the previous value can no longer be\nread.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">fence</span></code> instruction establishes an ordering between memory accesses requested by this thread\n(<code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> instructions) as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. 
The scope qualifier specifies the set of threads that may\nobserve the ordering effect of this operation.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence.acq_rel</span></code> is a light-weight fence that is sufficient for memory synchronization in most\nprograms. Instances of <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.acq_rel</span></code> synchronize when combined with additional memory operations\nas described in <code class=\"docutils literal notranslate\"><span class=\"pre\">acquire</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">release</span></code> patterns in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent, <code class=\"docutils literal notranslate\"><span class=\"pre\">.acq_rel</span></code>\nis assumed by default.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence.sc</span></code> is a slower fence that can restore <em>sequential consistency</em> when used in sufficient\nplaces, at the cost of performance. Instances of <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.sc</span></code> with sufficient scope always\nsynchronize by forming a total order per scope, determined at runtime. This total order can be\nconstrained further by other synchronization in the program.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.op_restrict</span></code> restricts the class of prior memory operations for which the <code class=\"docutils literal notranslate\"><span class=\"pre\">fence</span></code>\ninstruction provides the memory ordering guarantees. 
When <code class=\"docutils literal notranslate\"><span class=\"pre\">.op_restrict</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier_init</span></code>,\nthe fence only applies to the prior <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> operations executed by the same thread on\n<em>mbarrier objects</em> in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space.</p>\n<p>The address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> and the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> together specifies the memory range\n<code class=\"docutils literal notranslate\"><span class=\"pre\">[addr,</span> <span class=\"pre\">addr+size-1]</span></code> on which the ordering guarantees on the memory accesses across the proxies is to be\nprovided. The only supported value for the <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> operand is 128. <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>\nis used unconditionally, and the address specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> must fall within the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>\nstate space. 
Otherwise, the behavior is undefined.</p>\n<p>On <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> and higher <code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code> is a synonym for <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.sc</span></code><sup>1</sup>, and the <code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code>\nlevels <code class=\"docutils literal notranslate\"><span class=\"pre\">cta</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gl</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">sys</span></code> are synonymous with the <code class=\"docutils literal notranslate\"><span class=\"pre\">fence</span></code> scopes <code class=\"docutils literal notranslate\"><span class=\"pre\">cta</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gpu</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sys</span></code> respectively.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.proxy</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy</span></code> instructions establish an ordering between memory accesses that\nmay happen through different <em>proxies</em>.</p>\n<p>A <em>uni-directional</em> proxy ordering from the <em>from-proxykind</em> to the <em>to-proxykind</em> establishes\nordering between a prior memory access performed via the <em>from-proxykind</em> and a subsequent memory access\nperformed via the <em>to-proxykind</em>.</p>\n<p>A <em>bi-directional</em> proxy ordering between two proxykinds establishes two <em>uni-directional</em> proxy orderings\n: one from the first proxykind to the second proxykind and the other from the second proxykind to the first\nproxykind.</p>\n<p>The <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.proxykind</span></code> qualifier indicates the <em>bi-directional</em> proxy ordering that is established between the memory\naccesses done between the generic proxy and the proxy specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.proxykind</span></code>.</p>\n<p>Value <code class=\"docutils literal notranslate\"><span class=\"pre\">.alias</span></code> of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.proxykind</span></code> qualifier refers to memory accesses performed using virtually\naliased addresses to the same memory location. Value <code class=\"docutils literal notranslate\"><span class=\"pre\">.async</span></code> of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.proxykind</span></code> qualifier specifies\nthat the memory ordering is established between the async proxy and the generic proxy. The memory\nordering is limited only to the state space specified. If no state space is specified, then the memory\nordering applies on all state spaces.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> proxy fence can form a release sequence that synchronizes with an acquire\nsequence that contains a <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> proxy fence. 
The <code class=\"docutils literal notranslate\"><span class=\"pre\">.to_proxykind</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.from_proxykind</span></code> qualifiers indicate the <em>uni-directional</em> proxy ordering that is established.</p>\n<p>On <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> and higher, <code class=\"docutils literal notranslate\"><span class=\"pre\">membar.proxy</span></code> is a synonym for <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy</span></code>.</p>\n<p><sup>1</sup> The semantics of <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.sc</span></code> introduced with <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> is a superset of the semantics of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code> and the two are compatible; when executing on <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or later architectures,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code> acquires the full semantics of <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.sc</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.{cta,gl}</span></code> introduced in PTX ISA version 1.4.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.sys</span></code> introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence</span></code> introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.proxy</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy</span></code> introduced in PTX ISA version 7.5.</p>\n<p><code class=\"docutils 
literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier introduced in PTX ISA version 7.8.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.op_restrict</span></code> qualifier introduced in PTX ISA version 8.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy.async</span></code> is introduced in PTX ISA version 8.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.to_proxykind::from_proxykind</span></code> qualifier introduced in PTX ISA version 8.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.{cta,gl}</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.sys</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.proxy</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.op_restrict</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or 
higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy.async</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.to_proxykind::from_proxykind</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>membar.gl;\nmembar.cta;\nmembar.sys;\nfence.sc;\nfence.sc.cluster;\nfence.proxy.alias;\nmembar.proxy.alias;\nfence.mbarrier_init.release.cluster;\nfence.proxy.async;\nfence.proxy.async.shared::cta;\nfence.proxy.async.shared::cluster;\nfence.proxy.async.global;\n\ntensormap.replace.tile.global_address.global.b1024.b64   [gbl], new_addr;\nfence.proxy.tensormap::generic.release.gpu;\nfence.proxy.tensormap::generic.acquire.gpu [tmap], 128;\ncvta.global.u64  tmap, gbl;\ncp.async.bulk.tensor.1d.shared::cluster.global.tile  [addr0], [tmap, {tc0}], [mbar0];\n</pre></div>\n</div>\n</section>",
                "tooltip": "Enforce an ordering of memory operations.\n\nSyntax\n\n// Thread fence :\n\nfence{.sem}.scope;\n\n// Operation fence :\n\nfence.op_restrict.release.cluster;\n\n// Proxy fence (bi-directional) :\n\nfence.proxy.proxykind;\n\n// Proxy fence (uni-directional) :\n\nfence.proxy.to_proxykind::from_proxykind.release.scope;\n\nfence.proxy.to_proxykind::from_proxykind.acquire.scope  [addr], size;\n\n// Old style membar :\n\nmembar.level;\n\nmembar.proxy.proxykind;\n\n.sem       = { .sc, .acq_rel };\n\n.scope     = { .cta, .cluster, .gpu, .sys };\n\n.level     = { .cta, .gl, .sys };\n\n.proxykind = { .alias, .async, async.global, .async.shared::{cta, cluster} };\n\n.op_restrict = { .mbarrier_init };\n\n.to_proxykind::from_proxykind = {.tensormap::generic};\n\nDescription\n\nThe membar instruction guarantees that prior memory accesses requested by this thread (ld,\n\nst, atom and red instructions) are performed at the specified level, before later\n\nmemory operations requested by this thread following the membar instruction. The level\n\nqualifier specifies the set of threads that may observe the ordering effect of this operation.\n\nA memory read (e.g., by ld or atom) has been performed when the value read has been\n\ntransmitted from memory and cannot be modified by another thread at the indicated level. A memory\n\nwrite (e.g., by st, red or atom) has been performed when the value written has become\n\nvisible to other threads at the specified level, that is, when the previous value can no longer be\n\nread.\n\nThe fence instruction establishes an ordering between memory accesses requested by this thread\n\n(ld, st, atom and red instructions) as described in the Memory Consistency Model. The scope qualifier specifies the set of threads that may\n\nobserve the ordering effect of this operation.\n\nfence.acq_rel is a light-weight fence that is sufficient for memory synchronization in most\n\nprograms. 
Instances of fence.acq_rel synchronize when combined with additional memory operations\n\nas described in acquire and release patterns in the Memory Consistency Model. If the optional .sem qualifier is absent, .acq_rel\n\nis assumed by default.\n\nfence.sc is a slower fence that can restore sequential consistency when used in sufficient\n\nplaces, at the cost of performance. Instances of fence.sc with sufficient scope always\n\nsynchronize by forming a total order per scope, determined at runtime. This total order can be\n\nconstrained further by other synchronization in the program.\n\nQualifier .op_restrict restricts the class of prior memory operations for which the fence\n\ninstruction provides the memory ordering guarantees. When .op_restrict is .mbarrier_init,\n\nthe fence only applies to the prior mbarrier.init operations executed by the same thread on\n\nmbarrier objects in .shared::cta state space.\n\nThe address operand addr and the operand size together specifies the memory range\n\n[addr, addr+size-1] on which the ordering guarantees on the memory accesses across the proxies is to be\n\nprovided. The only supported value for the size operand is 128. Generic Addressing\n\nis used unconditionally, and the address specified by the operand addr must fall within the .global\n\nstate space. 
Otherwise, the behavior is undefined.\n\nOn sm_70 and higher membar is a synonym for fence.sc1, and the membar\n\nlevels cta, gl and sys are synonymous with the fence scopes cta, gpu and\n\nsys respectively.\n\nmembar.proxy and fence.proxy instructions establish an ordering between memory accesses that\n\nmay happen through different proxies.\n\nA uni-directional proxy ordering from the from-proxykind to the to-proxykind establishes\n\nordering between a prior memory access performed via the from-proxykind and a subsequent memory access\n\nperformed via the to-proxykind.\n\nA bi-directional proxy ordering between two proxykinds establishes two uni-directional proxy orderings\n\n: one from the first proxykind to the second proxykind and the other from the second proxykind to the first\n\nproxykind.\n\nThe .proxykind qualifier indicates the bi-directional proxy ordering that is established between the memory\n\naccesses done betwee ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence"
            };

        case "min":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min\" target=\"_blank\" rel=\"noopener noreferrer\">min(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min\" target=\"_blank\" rel=\"noopener noreferrer\">min(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min\" target=\"_blank\" rel=\"noopener noreferrer\">min(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: min</h1><section id=\"floating-point-instructions-min\">\n\n\n<p>Find the minimum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>min{.ftz}{.NaN}{.xorsign.abs}.f32  d, a, b;\nmin.f64                            d, a, b;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the minimum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.NaN</span></code> modifier is specified, then the result is canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> if either of the inputs is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.abs</span></code> modifier is specified, the magnitude of destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is the minimum of\nabsolute values of both the input arguments.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> modifier is specified, the sign bit of destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is equal to the XOR of the\nsign bits of both the inputs.</p>\n<p>Modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> must be specified together and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> considers the sign\nbit of both inputs before applying <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> operation.</p>\n<p>If the result of <code class=\"docutils literal notranslate\"><span class=\"pre\">min</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> then the <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifiers will be ignored.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (.xorsign) {\n    xorsign = getSignBit(a) ^ getSignBit(b);\n    if (.abs) {\n        a = |a|;\n        b = |b|;\n   }\n}\nif (isNaN(a) &amp;&amp; isNaN(b))                 d = NaN;\nelse if (.NaN &amp;&amp; (isNaN(a) || isNaN(b)))  d = NaN;\nelse if (isNaN(a))                        d = b;\nelse if (isNaN(b))                        d = a;\nelse                                      d = (a &lt; b) ? 
a : b;\nif (.xorsign &amp;&amp; !isNaN(d)) {\n    setSignBit(d, xorsign);\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>If values of both inputs are 0.0, then +0.0 &gt; -0.0.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.NaN</span></code>introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.xorsign.abs</span></code> introduced in PTX ISA version 7.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.NaN</span></code>requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.xorsign.abs</span></code> requires <code class=\"docutils 
literal notranslate\"><span class=\"pre\">sm_86</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>@p  min.ftz.f32  z,z,x;\n    min.f64      a,b,c;\n    // fp32 min with .NaN\n    min.NaN.f32  f0,f1,f2;\n    // fp32 min with .xorsign.abs\n    min.xorsign.abs.f32 Rd, Ra, Rb;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: min</h1><section id=\"half-precision-floating-point-instructions-min\">\n\n\n<p>Find the minimum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>min{.ftz}{.NaN}{.xorsign.abs}.f16      d, a, b;\nmin{.ftz}{.NaN}{.xorsign.abs}.f16x2    d, a, b;\nmin{.NaN}{.xorsign.abs}.bf16           d, a, b;\nmin{.NaN}{.xorsign.abs}.bf16x2         d, a, b;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the minimum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction types, input vectors are formed with half-word values\nfrom source operands. 
Half-word operands are then processed in parallel to store <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> result in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction\ntype, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.NaN</span></code> modifier is specified, then the result is canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> if either of the inputs is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifier is specified, the magnitude of destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is the minimum of\nabsolute values of both the input arguments.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> modifier is specified, the sign bit of destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is equal to the XOR of the\nsign bits of both the inputs.</p>\n<p>Modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> must be specified together and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> considers the sign\nbit of both inputs before applying <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> operation.</p>\n<p>If the result of <code class=\"docutils literal notranslate\"><span class=\"pre\">min</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> then the <code class=\"docutils 
literal notranslate\"><span class=\"pre\">.xorsign</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifiers will be ignored.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    if (.xorsign) {\n        xorsign = getSignBit(a) ^ getSignBit(b);\n        if (.abs) {\n            a = |a|;\n            b = |b|;\n        }\n    }\n    if (isNaN(a) &amp;&amp; isNaN(b))              d = NaN;\n    if (.NaN &amp;&amp; (isNaN(a) || isNaN(b)))    d = NaN;\n    else if (isNaN(a))                     d = b;\n    else if (isNaN(b))                     d = a;\n    else                                   d = (a &lt; b) ? a : b;\n    if (.xorsign &amp;&amp; !isNaN(d)) {\n         setSignBit(d, xorsign);\n    }\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n        if (.xorsign) {\n            xorsign = getSignBit(fA[i]) ^ getSignBit(fB[i]);\n            if (.abs) {\n               fA[i] = |fA[i]|;\n               fB[i] = |fB[i]|;\n           }\n        }\n        if (isNaN(fA[i]) &amp;&amp; isNaN(fB[i]))              d[i] = NaN;\n        if (.NaN &amp;&amp; (isNaN(fA[i]) || isNaN(fB[i])))    d[i] = NaN;\n        else if (isNaN(fA[i]))                         d[i] = fB[i];\n        else if (isNaN(fB[i]))                         d[i] = fA[i];\n        else                                           d[i] = (fA[i] &lt; fB[i]) ? 
fA[i] : fB[i];\n        if (.xorsign &amp;&amp; !isNaN(d[i])) {\n            setSignBit(d[i], xorsign);\n        }\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt>\n<dd>\n<p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">min.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>If values of both inputs are 0.0, then +0.0 &gt; -0.0.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.xorsign</span></code> introduced in PTX ISA version 7.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.xorsign.abs</span></code> support requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_86</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>min.ftz.f16       h0,h1,h2;\nmin.f16x2         b0,b1,b2;\n// SIMD fp16 min with .NaN\nmin.NaN.f16x2     b0,b1,b2;\nmin.bf16          h0, h1, h2;\n// SIMD bf16 min with NaN\nmin.NaN.bf16x2    b0, b1, b2;\n// scalar bf16 min with xorsign.abs\nmin.xorsign.abs.bf16 Rd, Ra, Rb\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: min</h1><section id=\"integer-arithmetic-instructions-min\">\n\n\n<p>Find the minimum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>min.atype         d, a, b;\nmin{.relu}.btype  d, a, b;\n\n.atype = { .u16, .u32, .u64,\n           .u16x2, .s16, .s64 };\n.btype = { .s16x2, .s32 
};\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the minimum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> instruction types, forms input vectors by half word values from source\noperands. Half-word operands are then processed in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> result\nin destination.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have the same type as the instruction type. 
For instruction types\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code>, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (type == u16x2 || type == s16x2) {\n    iA[0] = a[0:15];\n    iA[1] = a[16:31];\n    iB[0] = b[0:15];\n    iB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = (iA[i] &lt; iB[i]) ? iA[i] : iB[i];\n    }\n} else {\n    d = (a &lt; b) ? a : b; // Integer (signed and unsigned)\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Signed and unsigned differ.</p>\n<dl class=\"simple\">\n<dt>Saturation modifier:</dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.relu.{s16x2,</span> <span class=\"pre\">s32}</span></code> clamps the result to 0 if negative.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">min{.relu}.s16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">min.relu.s32</span></code> introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">min{.relu}.s16x2</span></code> and 
<code class=\"docutils literal notranslate\"><span class=\"pre\">min.relu.s32</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>    min.s32  r0,a,b;\n@p  min.u16  h,i,j;\n    min.s16x2.relu u,v,w;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: min\n\n\n\nFind the minimum of two values.\n\nSyntax\n\nmin{.ftz}{.NaN}{.xorsign.abs}.f32  d, a, b;\n\nmin.f64                            d, a, b;\n\nDescription\n\nStore the minimum of a and b in d.\n\nIf .NaN modifier is specified, then the result is canonical NaN if either of the inputs is\n\nNaN.\n\nIf .abs modifier is specified, the magnitude of destination operand d is the minimum of\n\nabsolute values of both the input argument...\n\n=====Half Precision Floating Point Instructions: min\n\n\n\nFind the minimum of two values.\n\nSyntax\n\nmin{.ftz}{.NaN}{.xorsign.abs}.f16      d, a, b;\n\nmin{.ftz}{.NaN}{.xorsign.abs}.f16x2    d, a, b;\n\nmin{.NaN}{.xorsign.abs}.bf16           d, a, b;\n\nmin{.NaN}{.xorsign.abs}.bf16x2         d, a, b;\n\nDescription\n\nStore the minimum of a and b in d.\n\nFor .f16x2 and .bf16x2 instruction types, input vectors are formed with half-word values\n\nfrom source operands. Half-word o...\n\n=====Integer Arithmetic Instructions: min\n\n\n\nFind the minimum of two values.\n\nSyntax\n\nmin.atype         d, a, b;\n\nmin{.relu}.btype  d, a, b;\n\n.atype = { .u16, .u32, .u64,\n\n           .u16x2, .s16, .s64 };\n\n.btype = { .s16x2, .s32 };\n\nDescription\n\nStore the minimum of a and b in d.\n\nFor .u16x2, .s16x2 instruction types, forms input vectors by half word values from source\n\noperands. Half-word operands are then processed in parallel to produce .u16x2, .s... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min"
            };

        // PTX `.minnctapersm` performance-tuning directive ("Minimum number of CTAs per SM").
        // The returned object carries three views of the same documentation section:
        //   html    - full HTML body of the section, preceded by a "For more information" external link
        //   tooltip - plain-text rendering of that section, terminated with " ..."
        //   url     - the section's anchor on docs.nvidia.com
        case "minnctapersm":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-minnctapersm\" target=\"_blank\" rel=\"noopener noreferrer\">minnctapersm <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Performance-Tuning Directives: .minnctapersm</h1><section id=\"performance-tuning-directives-minnctapersm\">\n\n\n<p>Minimum number of CTAs per SM.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.minnctapersm ncta\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declare the minimum number of CTAs from the kernel\u2019s grid to be mapped to a single multiprocessor\n(SM).</p>\n<p><strong>Notes</strong></p>\n<p>Optimizations based on <code class=\"docutils literal notranslate\"><span class=\"pre\">.minnctapersm</span></code> need either <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxntid</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.reqntid</span></code> to be specified as\nwell.</p>\n<p>If the total number of threads on a single SM resulting from <code class=\"docutils literal notranslate\"><span class=\"pre\">.minnctapersm</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxntid</span></code> /\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.reqntid</span></code> exceed maximum number of threads supported by an SM then directive <code class=\"docutils literal notranslate\"><span class=\"pre\">.minnctapersm</span></code>\nwill be ignored.</p>\n<p>In PTX ISA version 2.1 or higher, a warning is generated if <code class=\"docutils literal notranslate\"><span class=\"pre\">.minnctapersm</span></code> is specified without\nspecifying either <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxntid</span></code> 
or <code class=\"docutils literal notranslate\"><span class=\"pre\">.reqntid</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0 as a replacement for <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxnctapersm</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.entry foo .maxntid 256 .minnctapersm 4 { ... }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Minimum number of CTAs per SM.\n\nSyntax\n\n.minnctapersm ncta\n\nDescription\n\nDeclare the minimum number of CTAs from the kernel\u2019s grid to be mapped to a single multiprocessor\n\n(SM).\n\nNotes\n\nOptimizations based on .minnctapersm need either .maxntid or .reqntid to be specified as\n\nwell.\n\nIf the total number of threads on a single SM resulting from .minnctapersm and .maxntid /\n\n.reqntid exceed maximum number of threads supported by an SM then directive .minnctapersm\n\nwill be ignored.\n\nIn PTX ISA version 2.1 or higher, a warning is generated if .minnctapersm is specified without\n\nspecifying either .maxntid or .reqntid.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0 as a replacement for .maxnctapersm.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .maxntid 256 .minnctapersm 4 { ... }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-minnctapersm"
            };

        // PTX `mov` instruction. Two documentation sections share this opcode name
        // (anchors `...-mov` and `...-mov-2`): the register/address move, and the
        // vector pack/unpack form.
        //   html    - link list followed by both sections, each under its own <h1>
        //   tooltip - truncated plain-text excerpt of each section, introduced by "=====" headers
        //   url     - anchor of the first section only
        // Both external links in `html` carry the same visible label ("mov").
        // NOTE(review): in the pack/unpack Semantics block, the "unpack 8-bit elements from .b32"
        // line has no "=" between the two brace groups, unlike every sibling unpack form. The text
        // appears to be mirrored from the NVIDIA page, so presumably the fix belongs upstream or in
        // whatever produces this table rather than in a hand edit here - confirm.
        case "mov":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov\" target=\"_blank\" rel=\"noopener noreferrer\">mov <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2\" target=\"_blank\" rel=\"noopener noreferrer\">mov <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: mov</h1><section id=\"data-movement-and-conversion-instructions-mov\">\n\n\n<p>Set a register variable with the value of a register variable or an immediate value. Take the\nnon-generic address of a variable in global, local, or shared state space.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.type  d, a;\nmov.type  d, sreg;\nmov.type  d, avar;       // get address of variable\nmov.type  d, avar+imm;   // get address of variable with offset\nmov.u32   d, fname;      // get address of device function\nmov.u64   d, fname;      // get address of device function\nmov.u32   d, kernel;     // get address of entry function\nmov.u64   d, kernel;     // get address of entry function\n\n.type = { .pred,\n          .b16, .b32, .b64,\n          .u16, .u32, .u64,\n          .s16, .s32, .s64,\n                .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Write register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> may be a register, special register, variable with optional 
offset in an addressable\nmemory space, or function name.</p>\n<p>For variables declared in <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.local</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state spaces, <code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code>\nplaces the non-generic address of the variable (i.e., the address of the variable in its state\nspace) into the destination register. The generic address of a variable in <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">global</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code> state space may be generated by first taking the address within the state\nspace with <code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and then converting it to a generic address using the <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta</span></code> instruction;\nalternately, the generic address of a variable declared in <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code> state space may be taken directly using the <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta</span></code> instruction.</p>\n<p>Note that if the address of a device function parameter is 
moved to a register, the parameter will\nbe copied onto the stack and the address will be in the local state space.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a;\nd = sreg;\nd = &amp;avar;        // address is non-generic; i.e., within the variable's declared state space\nd = &amp;avar+imm;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<ul class=\"simple\">\n<li><p>Although only predicate and bit-size types are required, we include the arithmetic types for the\nprogrammer\u2019s convenience: their use enhances program readability and allows additional type\nchecking.</p></li>\n<li><p>When moving address of a kernel or a device function, only <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> instruction types\nare allowed. However, if a signed type is used, it is not treated as a compilation error. The\ncompiler issues a warning in this case.</p></li>\n</ul>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>Taking the address of kernel entry functions requires PTX ISA version 3.1 or later. Kernel function\naddresses should only be used in the context of CUDA Dynamic Parallelism system calls. 
See the <em>CUDA\nDynamic Parallelism Programming Guide</em> for details.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mov.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Taking the address of kernel entry functions requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_35</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.f32  d,a;\nmov.u16  u,v;\nmov.f32  k,0.1;\nmov.u32  ptr, A;        // move address of A into ptr\nmov.u32  ptr, A[5];     // move address of A[5] into ptr\nmov.u32  ptr, A+20;     // move address with offset into ptr\nmov.u32  addr, myFunc;  // get address of device function 'myFunc'\nmov.u64  kptr, main;    // get address of entry function 'main'\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: mov</h1><section id=\"data-movement-and-conversion-instructions-mov-2\">\n<span id=\"id6\"></span>\n\n<p>Move vector-to-scalar (pack) or scalar-to-vector (unpack).</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.type  d, a;\n\n.type = { .b16, .b32, .b64, .b128 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Write scalar register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with the packed value of vector register <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, or write vector register\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with the unpacked values from scalar register <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>When destination operand <code class=\"docutils literal notranslate\"><span 
class=\"pre\">d</span></code> is a vector register, the sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> may be used for one or\nmore elements provided that at least one element is a scalar register.</p>\n<p>For bit-size types, <code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> may be used to pack vector elements into a scalar register or unpack\nsub-fields of a scalar register into a vector. Both the overall size of the vector and the size of\nthe scalar must match the size of the instruction type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// pack two 8-bit elements into .b16\nd = a.x | (a.y &lt;&lt; 8)\n// pack four 8-bit elements into .b32\nd = a.x | (a.y &lt;&lt; 8)  | (a.z &lt;&lt; 16) | (a.w &lt;&lt; 24)\n// pack two 16-bit elements into .b32\nd = a.x | (a.y &lt;&lt; 16)\n// pack four 16-bit elements into .b64\nd = a.x | (a.y &lt;&lt; 16)  | (a.z &lt;&lt; 32) | (a.w &lt;&lt; 48)\n// pack two 32-bit elements into .b64\nd = a.x | (a.y &lt;&lt; 32)\n// pack four 32-bit elements into .b128\nd = a.x | (a.y &lt;&lt; 32)  | (a.z &lt;&lt; 64) | (a.w &lt;&lt; 96)\n// pack two 64-bit elements into .b128\nd = a.x | (a.y &lt;&lt; 64)\n\n// unpack 8-bit elements from .b16\n{ d.x, d.y } = { a[0..7], a[8..15] }\n// unpack 8-bit elements from .b32\n{ d.x, d.y, d.z, d.w }\n        { a[0..7], a[8..15], a[16..23], a[24..31] }\n\n// unpack 16-bit elements from .b32\n{ d.x, d.y }  = { a[0..15], a[16..31] }\n// unpack 16-bit elements from .b64\n{ d.x, d.y, d.z, d.w } =\n        { a[0..15], a[16..31], a[32..47], a[48..63] }\n\n// unpack 32-bit elements from .b64\n{ d.x, d.y } = { a[0..31], a[32..63] }\n\n// unpack 32-bit elements from .b128\n{ d.x, d.y, d.z, d.w } =\n        { a[0..31], a[32..63], a[64..95], a[96..127] }\n// unpack 64-bit elements from .b128\n{ d.x, d.y } = { a[0..63], a[64..127] 
}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type introduced in PTX ISA version 8.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.b128</span></code> type requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.b32 %r1,{a,b};      // a,b have type .u16\nmov.b64 {lo,hi}, %x;    // %x is a double; lo,hi are .u32\nmov.b32 %r1,{x,y,z,w};  // x,y,z,w have type .b8\nmov.b32 {r,g,b,a},%r1;  // r,g,b,a have type .u8\nmov.b64 {%r1, _}, %x;   // %x is.b64, %r1 is .b32\nmov.b128 {%b1, %b2}, %y;   // %y is.b128, %b1 and % b2 are .b64\nmov.b128 %y, {%b1, %b2};   // %y is.b128, %b1 and % b2 are .b64\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Data Movement and Conversion Instructions: mov\n\n\n\nSet a register variable with the value of a register variable or an immediate value. Take the\n\nnon-generic address of a variable in global, local, or shared state space.\n\nSyntax\n\nmov.type  d, a;\n\nmov.type  d, sreg;\n\nmov.type  d, avar;       // get address of variable\n\nmov.type  d, avar+imm;   // get address of variable with offset\n\nmov.u32   d, fname;      // get address of device function\n\nmov.u64   d, f...\n\n=====Data Movement and Conversion Instructions: mov\n\n\n\nMove vector-to-scalar (pack) or scalar-to-vector (unpack).\n\nSyntax\n\nmov.type  d, a;\n\n.type = { .b16, .b32, .b64, .b128 };\n\nDescription\n\nWrite scalar register d with the packed value of vector register a, or write vector register\n\nd with the unpacked values from scalar register a.\n\nWhen destination operand d is a vector register, the sink symbol '_' may be used for one or\n\nmore elements provided that at le... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov"
            };

        case "mul":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul\" target=\"_blank\" rel=\"noopener noreferrer\">mul(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul\" target=\"_blank\" rel=\"noopener noreferrer\">mul(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mul\" target=\"_blank\" rel=\"noopener noreferrer\">mul(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: mul</h1><section id=\"floating-point-instructions-mul\">\n\n\n<p>Multiply two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mul{.rnd}{.ftz}{.sat}.f32  d, a, b;\nmul{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the product of two values.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a * b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>For floating-point multiplication, all operands must be the same size.</p>\n<p>Rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt>\n<dd>\n<p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code 
class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt>\n<dd>\n<p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>The default value of rounding modifier is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>. Note that a <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code> instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. A <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code> instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, <code class=\"docutils literal notranslate\"><span class=\"pre\">mul/add</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mul/sub</span></code> sequences with no rounding modifiers may be\noptimized to use fused-multiply-add instructions on the target device.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>Saturation modifier:</p>\n<p><code class=\"docutils literal 
notranslate\"><span class=\"pre\">mul.sat.f32</span></code> clamps the result to [0.0, 1.0]. <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Rounding modifiers have the following target requirements:</p>\n<dl>\n<dt>\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code>\n</dt>\n<dd>\n<p>available for all targets</p>\n</dd>\n<dt>\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code>\n</dt>\n<dd>\n<p>for <code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f64</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>for <code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f32</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n</dd>\n</dl>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mul.ftz.f32 circumf,radius,pi  // a single-precision multiply\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: mul</h1><section 
id=\"half-precision-floating-point-instructions-mul\">\n\n\n<p>Multiply two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mul{.rnd}{.ftz}{.sat}.f16   d, a, b;\nmul{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nmul{.rnd}.bf16   d, a, b;\nmul{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs multiplication and writes the resulting value into a destination register.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vectors by half word values from source\noperands. Half-word operands are then multiplied in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\nresult in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>\ninstruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type,\noperands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = a * b;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = fA[i] * fB[i];\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt>\n<dd>\n<p>mantissa LSB rounds to 
nearest even</p>\n</dd>\n</dl>\n<p>The default value of rounding modifier is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>. Note that a <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code> instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. A <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code> instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mul/sub</span></code> sequences with no rounding modifiers may\nbe optimized to use fused-multiply-add instructions on the target device.</p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt>\n<dd>\n<p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mul.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt>Saturation modifier:</dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.sat.{f16,</span> <span class=\"pre\">f16x2}</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul{.rnd}.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mul{.rnd}.bf16x2</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul{.rnd}.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mul{.rnd}.bf16x2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// scalar f16 multiplications\nmul.f16        d0, a0, b0;\nmul.rn.f16     d1, a1, b1;\nmul.bf16       bd0, ba0, bb0;\nmul.rn.bf16    bd1, ba1, bb1;\n\n// SIMD f16 multiplication\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1};   // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3};   // pack two f16 to 32bit f16x2\nmul.f16x2  p3, p1, p2;   // SIMD f16x2 multiplication\n\n// SIMD bf16 multiplication\ncvt.rn.bf16x2.f32 p4, f4, f5; // Convert two f32 into packed bf16x2\ncvt.rn.bf16x2.f32 p5, f6, f7; // Convert two f32 into packed bf16x2\nmul.bf16x2  p6, p4, p5;       // SIMD bf16x2 multiplication\n\n// SIMD fp16 multiplication\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nmul.f16x2       f2, f0, f1;     // 
SIMD f16x2 multiplication\n\n// SIMD bf16 multiplication\nld.global.b32   f3, [addr + 8];  // load 32 bit which hold packed bf16x2\nld.global.b32   f4, [addr + 12]; // load 32 bit which hold packed bf16x2\nmul.bf16x2      f5, f3, f4;      // SIMD bf16x2 multiplication\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: mul</h1><section id=\"integer-arithmetic-instructions-mul\">\n\n\n<p>Multiply two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mul.mode.type  d, a, b;\n\n.mode = { .hi, .lo, .wide };\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the product of two values.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>t = a * b;\nn = bitwidth of type;\nd = t;            // for .wide\nd = t&lt;2n-1..n&gt;;   // for .hi variant\nd = t&lt;n-1..0&gt;;    // for .lo variant\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The type of the operation represents the types of the <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> operands. If <code class=\"docutils literal notranslate\"><span class=\"pre\">.hi</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.lo</span></code> is specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is the same size as <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and either the upper or lower\nhalf of the result is written to the destination register. 
If <code class=\"docutils literal notranslate\"><span class=\"pre\">.wide</span></code> is specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is\ntwice as wide as <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to receive the full result of the multiplication.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.wide</span></code> suffix is supported only for 16- and 32-bit integer types.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mul.wide.s16 fa,fxs,fys;   // 16*16 bits yields 32 bits\nmul.lo.s16 fa,fxs,fys;     // 16*16 bits, save only the low 16 bits\nmul.wide.s32 z,x,y;        // 32*32 bits, creates 64 bit result\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: mul\n\n\n\nMultiply two values.\n\nSyntax\n\nmul{.rnd}{.ftz}{.sat}.f32  d, a, b;\n\nmul{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nCompute the product of two values.\n\nSemantics\n\nd = a * b;\n\nNotes\n\nFor floating-point multiplication, all operands must be the same size.\n\nRounding modifiers:\n\n.rn\n\nmantissa LSB rounds to nearest even\n\n.rz\n\nmantissa LSB rounds towards zero\n\n.rm\n\nmantissa LSB rounds tow...\n\n=====Half Precision Floating Point Instructions: mul\n\n\n\nMultiply two values.\n\nSyntax\n\nmul{.rnd}{.ftz}{.sat}.f16   d, a, b;\n\nmul{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nmul{.rnd}.bf16   d, a, b;\n\nmul{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms multiplication and writes the resulting value into a destination register.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vectors by half word values from source\n\noperands. Half-word operands are then mul...\n\n=====Integer Arithmetic Instructions: mul\n\n\n\nMultiply two values.\n\nSyntax\n\nmul.mode.type  d, a, b;\n\n.mode = { .hi, .lo, .wide };\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64 };\n\nDescription\n\nCompute the product of two values.\n\nSemantics\n\nt = a * b;\n\nn = bitwidth of type;\n\nd = t;            // for .wide\n\nd = t<2n-1..n>;   // for .hi variant\n\nd = t<n-1..0>;    // for .lo variant\n\nNotes\n\nThe type of the operation represents the types of the a and ... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul"
            };

        // PTX `mul24`: multiply two 24-bit integer values.
        // The entry has three fields:
        //   - `html`: a single "For more information" link followed by the
        //     markup of the "Integer Arithmetic Instructions: mul24" section.
        //   - `tooltip`: the same section as plain text, ending in a " ..." marker.
        //     It holds raw `<` / `>` (e.g. `t<47..16>`) where `html` uses
        //     `&lt;` / `&gt;`.
        //     NOTE(review): presumably consumers render it as text, not markup — confirm.
        //   - `url`: the anchor of that section in the NVIDIA PTX documentation.
        case "mul24":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mul24\" target=\"_blank\" rel=\"noopener noreferrer\">mul24(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: mul24</h1><section id=\"integer-arithmetic-instructions-mul24\">\n\n\n<p>Multiply two 24-bit integer values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mul24.mode.type  d, a, b;\n\n.mode = { .hi, .lo };\n.type = { .u32, .s32 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the product of two 24-bit integer values held in 32-bit source registers, and return either\nthe high or low 32-bits of the 48-bit result.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>t = a * b;\nd = t&lt;47..16&gt;;    // for .hi variant\nd = t&lt;31..0&gt;;     // for .lo variant\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Integer multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul24.hi</span></code> performs a 24x24-bit multiply and returns the high 32 bits of the 48-bit result.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul24.lo</span></code> performs a 24x24-bit multiply and returns the low 32 bits of the 48-bit result.</p>\n<p>All operands are of the same type and size.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul24.hi</span></code> may be less efficient on machines without hardware support for 24-bit multiply.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA 
Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mul24.lo.s32 d,a,b;   // low 32-bits of 24x24-bit signed multiply.\n</pre></div>\n</div>\n</section>",
                "tooltip": "Multiply two 24-bit integer values.\n\nSyntax\n\nmul24.mode.type  d, a, b;\n\n.mode = { .hi, .lo };\n\n.type = { .u32, .s32 };\n\nDescription\n\nCompute the product of two 24-bit integer values held in 32-bit source registers, and return either\n\nthe high or low 32-bits of the 48-bit result.\n\nSemantics\n\nt = a * b;\n\nd = t<47..16>;    // for .hi variant\n\nd = t<31..0>;     // for .lo variant\n\nNotes\n\nInteger multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.\n\nmul24.hi performs a 24x24-bit multiply and returns the high 32 bits of the 48-bit result.\n\nmul24.lo performs a 24x24-bit multiply and returns the low 32 bits of the 48-bit result.\n\nAll operands are of the same type and size.\n\nmul24.hi may be less efficient on machines without hardware support for 24-bit multiply.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmul24.lo.s32 d,a,b;   // low 32-bits of 24x24-bit signed multiply.\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mul24"
            };

        case "multimem":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\" target=\"_blank\" rel=\"noopener noreferrer\">multimem.ld_reduce <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\" target=\"_blank\" rel=\"noopener noreferrer\">multimem.red <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\" target=\"_blank\" rel=\"noopener noreferrer\">multimem.st <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red</h1><section id=\"data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\">\n<span id=\"data-movement-and-conversion-instructions-multimem\"></span>\n\n<p>Multimem addresses can only be accessed only by multimem.* operations. 
Accessing a multimem address\nwith <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code> or any other memory operations results in undefined behavior.</p>\n<p>Refer to <em>CUDA programming guide</em> for creation and management of the multimem addresses.</p>\n<p><strong>multimem.ld_reduce, multimem.st, multimem.red</strong></p>\n<p>Perform memory operations on the multimem address.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Integer type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op.type      d, [a];\nmultimem.st{.stsem}{.scope}{.ss}.type                [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.op.type           [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add, .and, .or, .xor }\n.type =     { .b32, .b64,  .u32, .u64, .s32, .s64 }\n\n// Floating point type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op{.acc_prec}{.vec}.type    d, [a];\nmultimem.st{.stsem}{.scope}{.ss}{.vec}.type                         [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.redop{.vec}.type                 [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add }\n.redop  =   { .add }\n.acc_prec = { .acc::f32 }\n.vec =      { .v2, .v4, .v8 }\n.type=      { .f16, .f16x2, .bf16, .bf16x2, .f32, .f64 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs the following operations:</p>\n<ul class=\"simple\">\n<li><p>load 
operation on the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, which involves loading of data from all of the\nmultiple memory locations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>,</p></li>\n<li><p>reduction operation specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> on the multiple data loaded from the multimem address\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p></li>\n</ul>\n<p>The result of the reduction operation in returned in register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> performs a store operation of the input operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to all the memory\nlocations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> performs a reduction operation on all the memory locations pointed to\nby the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, with operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs reduction on the values loaded from all the memory\nlocations that the multimem address points to. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> perform reduction\non all the memory locations that the multimem address points to.</p>\n<p>Address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> must be a multimem address. Otherwise, the behavior is undefined.  Supported\naddressing modes for operand a and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state\nspace then the behavior is undefined.</p>\n<p>For floating-point type multi- operations, the size of the specified type along with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> must\nequal either 32-bits or 64-bits or 128-bits. No other combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> and type are\nallowed. 
Type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> cannot be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> qualifier.</p>\n<p>The following table describes the valid combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> and base type:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 38%\"/>\n<col style=\"width: 62%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>op</p></th>\n<th class=\"head\"><p>Base type</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td>\n<div class=\"line-block\">\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>\n</div>\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\n</div>\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code>\n</div>\n</div>\n</td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code 
class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td>\n<div class=\"line-block\">\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>\n</div>\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\n</div>\n</div>\n</td>\n</tr>\n</tbody>\n</table>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code>, the default precision of the intermediate accumulation is same as the\nspecified type. 
Optionally for <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> types, <code class=\"docutils literal notranslate\"><span class=\"pre\">.acc::f32</span></code>\ncan be specified to change the precision of the intermediate accumulation to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>.</p>\n<p>Optional qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.ldsem</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.stsem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.redsem</span></code> specify the memory synchronizing effect\nof the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> respectively, as described in\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. 
If explicit semantics qualifiers\nare not specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> default to <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified for\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> then <code class=\"docutils literal notranslate\"><span class=\"pre\">.sys</span></code> scope is assumed by default.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.1.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.acc::f32</span></code> qualifier introduced in PTX ISA version 8.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>multimem.ld_reduce.and.b32                    val1_b32, [addr1];\nmultimem.ld_reduce.acquire.gpu.global.add.u32 val2_u32, [addr2];\n\nmultimem.st.relaxed.gpu.b32                [addr3], val3_b32;\nmultimem.st.release.cta.global.u32         [addr4], 
val4_u32;\n\nmultimem.red.relaxed.gpu.max.f64           [addr5], val5_f64;\nmultimem.red.release.cta.global.add.v4.f32 [addr6], {val6, val7, val8, val9};\nmultimem.ld_reduce.add.acc::f32.v2.f16x2   {val_10, val_11}, [addr7];\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red</h1><section id=\"data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\">\n<span id=\"data-movement-and-conversion-instructions-multimem\"></span>\n\n<p>Multimem addresses can only be accessed only by multimem.* operations. Accessing a multimem address\nwith <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code> or any other memory operations results in undefined behavior.</p>\n<p>Refer to <em>CUDA programming guide</em> for creation and management of the multimem addresses.</p>\n<p><strong>multimem.ld_reduce, multimem.st, multimem.red</strong></p>\n<p>Perform memory operations on the multimem address.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Integer type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op.type      d, [a];\nmultimem.st{.stsem}{.scope}{.ss}.type                [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.op.type           [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add, .and, .or, .xor }\n.type =     { .b32, .b64,  .u32, .u64, .s32, .s64 }\n\n// Floating point type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op{.acc_prec}{.vec}.type    d, [a];\nmultimem.st{.stsem}{.scope}{.ss}{.vec}.type                         [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.redop{.vec}.type                 [a], b;\n\n.ss 
=       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add }\n.redop  =   { .add }\n.acc_prec = { .acc::f32 }\n.vec =      { .v2, .v4, .v8 }\n.type=      { .f16, .f16x2, .bf16, .bf16x2, .f32, .f64 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs the following operations:</p>\n<ul class=\"simple\">\n<li><p>load operation on the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, which involves loading of data from all of the\nmultiple memory locations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>,</p></li>\n<li><p>reduction operation specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> on the multiple data loaded from the multimem address\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p></li>\n</ul>\n<p>The result of the reduction operation in returned in register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> performs a store operation of the input operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to all the memory\nlocations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> performs a reduction operation on all the memory locations pointed to\nby the multimem address <code class=\"docutils literal notranslate\"><span 
class=\"pre\">a</span></code>, with operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs reduction on the values loaded from all the memory\nlocations that the multimem address points to. In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> perform reduction\non all the memory locations that the multimem address points to.</p>\n<p>Address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> must be a multimem address. Otherwise, the behavior is undefined.  Supported\naddressing modes for operand a and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state\nspace then the behavior is undefined.</p>\n<p>For floating-point type multi- operations, the size of the specified type along with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> must\nequal either 32-bits or 64-bits or 128-bits. No other combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> and type are\nallowed. 
Type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> cannot be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> qualifier.</p>\n<p>The following table describes the valid combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> and base type:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 38%\"/>\n<col style=\"width: 62%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>op</p></th>\n<th class=\"head\"><p>Base type</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td>\n<div class=\"line-block\">\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>\n</div>\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\n</div>\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code>\n</div>\n</div>\n</td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code 
class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td>\n<div class=\"line-block\">\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>\n</div>\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\n</div>\n</div>\n</td>\n</tr>\n</tbody>\n</table>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code>, the default precision of the intermediate accumulation is same as the\nspecified type. 
Optionally for <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> types, <code class=\"docutils literal notranslate\"><span class=\"pre\">.acc::f32</span></code>\ncan be specified to change the precision of the intermediate accumulation to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>.</p>\n<p>Optional qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.ldsem</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.stsem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.redsem</span></code> specify the memory synchronizing effect\nof the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> respectively, as described in\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. 
If explicit semantics qualifiers\nare not specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> default to <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified for\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> then <code class=\"docutils literal notranslate\"><span class=\"pre\">.sys</span></code> scope is assumed by default.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.1.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.acc::f32</span></code> qualifier introduced in PTX ISA version 8.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>multimem.ld_reduce.and.b32                    val1_b32, [addr1];\nmultimem.ld_reduce.acquire.gpu.global.add.u32 val2_u32, [addr2];\n\nmultimem.st.relaxed.gpu.b32                [addr3], val3_b32;\nmultimem.st.release.cta.global.u32         [addr4], 
val4_u32;\n\nmultimem.red.relaxed.gpu.max.f64           [addr5], val5_f64;\nmultimem.red.release.cta.global.add.v4.f32 [addr6], {val6, val7, val8, val9};\nmultimem.ld_reduce.add.acc::f32.v2.f16x2   {val_10, val_11}, [addr7];\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red</h1><section id=\"data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\">\n<span id=\"data-movement-and-conversion-instructions-multimem\"></span>\n\n<p>Multimem addresses can only be accessed only by multimem.* operations. Accessing a multimem address\nwith <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code> or any other memory operations results in undefined behavior.</p>\n<p>Refer to <em>CUDA programming guide</em> for creation and management of the multimem addresses.</p>\n<p><strong>multimem.ld_reduce, multimem.st, multimem.red</strong></p>\n<p>Perform memory operations on the multimem address.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// Integer type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op.type      d, [a];\nmultimem.st{.stsem}{.scope}{.ss}.type                [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.op.type           [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add, .and, .or, .xor }\n.type =     { .b32, .b64,  .u32, .u64, .s32, .s64 }\n\n// Floating point type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op{.acc_prec}{.vec}.type    d, [a];\nmultimem.st{.stsem}{.scope}{.ss}{.vec}.type                         [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.redop{.vec}.type                 [a], b;\n\n.ss 
=       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add }\n.redop  =   { .add }\n.acc_prec = { .acc::f32 }\n.vec =      { .v2, .v4, .v8 }\n.type=      { .f16, .f16x2, .bf16, .bf16x2, .f32, .f64 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs the following operations:</p>\n<ul class=\"simple\">\n<li><p>load operation on the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, which involves loading of data from all of the\nmultiple memory locations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>,</p></li>\n<li><p>reduction operation specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> on the multiple data loaded from the multimem address\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p></li>\n</ul>\n<p>The result of the reduction operation in returned in register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> performs a store operation of the input operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to all the memory\nlocations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> performs a reduction operation on all the memory locations pointed to\nby the multimem address <code class=\"docutils literal notranslate\"><span 
class=\"pre\">a</span></code>, with operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs reduction on the values loaded from all the memory\nlocations that the multimem address points to. In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> perform reduction\non all the memory locations that the multimem address points to.</p>\n<p>Address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> must be a multimem address. Otherwise, the behavior is undefined.  Supported\naddressing modes for operand a and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state\nspace then the behavior is undefined.</p>\n<p>For floating-point type multi- operations, the size of the specified type along with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> must\nequal either 32-bits or 64-bits or 128-bits. No other combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> and type are\nallowed. 
Type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> cannot be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> qualifier.</p>\n<p>The following table describes the valid combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> and base type:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 38%\"/>\n<col style=\"width: 62%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>op</p></th>\n<th class=\"head\"><p>Base type</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td>\n<div class=\"line-block\">\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>\n</div>\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\n</div>\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code>\n</div>\n</div>\n</td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code 
class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n</tr>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td>\n<div class=\"line-block\">\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>\n</div>\n<div class=\"line\">\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\n</div>\n</div>\n</td>\n</tr>\n</tbody>\n</table>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code>, the default precision of the intermediate accumulation is same as the\nspecified type. 
Optionally for <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> types, <code class=\"docutils literal notranslate\"><span class=\"pre\">.acc::f32</span></code>\ncan be specified to change the precision of the intermediate accumulation to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>.</p>\n<p>Optional qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.ldsem</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.stsem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.redsem</span></code> specify the memory synchronizing effect\nof the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> respectively, as described in\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. 
If explicit semantics qualifiers\nare not specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> default to <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified for\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> then <code class=\"docutils literal notranslate\"><span class=\"pre\">.sys</span></code> scope is assumed by default.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.1.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.acc::f32</span></code> qualifier introduced in PTX ISA version 8.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>multimem.ld_reduce.and.b32                    val1_b32, [addr1];\nmultimem.ld_reduce.acquire.gpu.global.add.u32 val2_u32, [addr2];\n\nmultimem.st.relaxed.gpu.b32                [addr3], val3_b32;\nmultimem.st.release.cta.global.u32         [addr4], 
val4_u32;\n\nmultimem.red.relaxed.gpu.max.f64           [addr5], val5_f64;\nmultimem.red.release.cta.global.add.v4.f32 [addr6], {val6, val7, val8, val9};\nmultimem.ld_reduce.add.acc::f32.v2.f16x2   {val_10, val_11}, [addr7];\n</pre></div>\n</div>\n</section>",
                "tooltip": "locations which the multimem address points to.\n\nMultimem addresses can only be accessed only by multimem.* operations. Accessing a multimem address\n\nwith ld, st or any other memory operations results in undefined behavior.\n\nRefer to CUDA programming guide for creation and management of the multimem addresses.\n\nmultimem.ld_reduce, multimem.st, multimem.red\n\nPerform memory operations on the multimem address.\n\nSyntax\n\n// Integer type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op.type      d, [a];\n\nmultimem.st{.stsem}{.scope}{.ss}.type                [a], b;\n\nmultimem.red{.redsem}{.scope}{.ss}.op.type           [a], b;\n\n.ss =       { .global }\n\n.ldsem =    { .weak, .relaxed, .acquire }\n\n.stsem =    { .weak, .relaxed, .release }\n\n.redsem =   { .relaxed, .release }\n\n.scope =    { .cta, .cluster, .gpu, .sys }\n\n.op  =      { .min, .max, .add, .and, .or, .xor }\n\n.type =     { .b32, .b64,  .u32, .u64, .s32, .s64 }\n\n// Floating point type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op{.acc_prec}{.vec}.type    d, [a];\n\nmultimem.st{.stsem}{.scope}{.ss}{.vec}.type                         [a], b;\n\nmultimem.red{.redsem}{.scope}{.ss}.redop{.vec}.type                 [a], b;\n\n.ss =       { .global }\n\n.ldsem =    { .weak, .relaxed, .acquire }\n\n.stsem =    { .weak, .relaxed, .release }\n\n.redsem =   { .relaxed, .release }\n\n.scope =    { .cta, .cluster, .gpu, .sys }\n\n.op  =      { .min, .max, .add }\n\n.redop  =   { .add }\n\n.acc_prec = { .acc::f32 }\n\n.vec =      { .v2, .v4, .v8 }\n\n.type=      { .f16, .f16x2, .bf16, .bf16x2, .f32, .f64 }\n\nDescription\n\nInstruction multimem.ld_reduce performs the following operations:\n\nload operation on the multimem address a, which involves loading of data from all of the\n\nmultiple memory locations pointed to by the multimem address a,\n\nreduction operation specified by .op on the multiple data loaded from the multimem address\n\na.\n\nThe result of the 
reduction operation in returned in register d.\n\nInstruction multimem.st performs a store operation of the input operand b to all the memory\n\nlocations pointed to by the multimem address a.\n\nInstruction multimem.red performs a reduction operation on all the memory locations pointed to\n\nby the multimem address a, with operand b.\n\nInstruction multimem.ld_reduce performs reduction on the values loaded from all the memory\n\nlocations that the multimem address points to. In contrast, the multimem.red perform reduction\n\non all the memory locations that the multimem address points to.\n\nAddress operand a must be a multimem address. Otherwise, the behavior is undefined.  Supported\n\naddressing modes for operand a and alignment requirements are described in Addresses as Operands.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the address specified by a does not fall within the address window of .global state\n\nspace then the behavior is undefined.\n\nFor floating-point type multi- operations, the size of the specified type along with .vec must\n\nequal either 32-bits or 64-bits or 128-bits. No other combinations of .vec and type are\n\nallowed. Type .f64 cannot be used with .vec qualifier.\n\nThe following table describes the valid combinations of .op and base type:\n\n\n\nop\n\nBase type\n\n\n\n.add\n\n.u32, .u64, .s32\n\n.f16, .f16x2, .bf16, .bf16x2\n\n.f32, .f64\n\n\n\n.and, .or, .xor\n\n.b32, .b64\n\n.min, .max\n\n.u32, .s32, .u64, .s64\n\n.f16, .f16x2, .bf16, .bf16x2\n\n\n\nFor multimem.ld_reduce, the default precision of the intermediate accumulation is same as the\n\nspecified type. 
Optionally for .f16, .f16x2, .bf16 and .bf16x2 types, .acc::f32\n\ncan be specified to change the precision of the intermediate accumulation to .f32.\n\nOptional qualifiers .ldsem, .stsem and .redsem specify the memory synchronizing effect\n\nof the multimem.ld_reduce, multimem.st and multimem.red respectively, as described in\n\nMemory Consistency Model. If explicit semantics qualifiers\n\nare not specified, then multimem.ld_reduce and multimem.st default to .weak and\n\nmultimem.red defaults to .relaxed.\n\nThe optional .scope qualifier specifies the set of threads that can directly observe the memory\n\nsynchronizi ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red"
            };

        // PTX `nanosleep` (Miscellaneous Instructions): suspend the thread for an approximate delay given in nanoseconds.
        // Fields: `html` is the marked-up documentation section preceded by a link to the NVIDIA docs,
        // `tooltip` is the same text flattened to plain text, and `url` is the docs anchor for this opcode.
        case "nanosleep":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep\" target=\"_blank\" rel=\"noopener noreferrer\">nanosleep <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Miscellaneous Instructions: nanosleep</h1><section id=\"miscellaneous-instructions-nanosleep\">\n\n\n<p>Suspend the thread for an approximate delay given in nanoseconds.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>nanosleep.u32 t;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Suspends the thread for a sleep duration approximately close to the delay <code class=\"docutils literal notranslate\"><span class=\"pre\">t</span></code>, specified in\nnanoseconds. <code class=\"docutils literal notranslate\"><span class=\"pre\">t</span></code> may be a register or an immediate value.</p>\n<p>The sleep duration is approximated, but guaranteed to be in the interval <code class=\"docutils literal notranslate\"><span class=\"pre\">[0,</span> <span class=\"pre\">2*t]</span></code>. The maximum\nsleep duration is 1 millisecond. 
The implementation may reduce the sleep duration for individual\nthreads within a warp such that all sleeping threads in the warp wake up together.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">nanosleep</span></code> introduced in PTX ISA 6.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">nanosleep</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 r;\n.reg .pred p;\n\nnanosleep.u32 r;\nnanosleep.u32 42;\n@p nanosleep.u32 r;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Suspend the thread for an approximate delay given in nanoseconds.\n\nSyntax\n\nnanosleep.u32 t;\n\nDescription\n\nSuspends the thread for a sleep duration approximately close to the delay t, specified in\n\nnanoseconds. t may be a register or an immediate value.\n\nThe sleep duration is approximated, but guaranteed to be in the interval [0, 2*t]. The maximum\n\nsleep duration is 1 millisecond. The implementation may reduce the sleep duration for individual\n\nthreads within a warp such that all sleeping threads in the warp wake up together.\n\nPTX ISA Notes\n\nnanosleep introduced in PTX ISA 6.3.\n\nTarget ISA Notes\n\nnanosleep requires sm_70 or higher.\n\nExamples\n\n.reg .b32 r;\n\n.reg .pred p;\n\nnanosleep.u32 r;\n\nnanosleep.u32 42;\n\n@p nanosleep.u32 r;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep"
            };

        // PTX `%nclusterid` special register: number of clusters in each grid dimension.
        // Fields: `html` is the marked-up documentation section preceded by a link to the NVIDIA docs,
        // `tooltip` is the same text flattened to plain text, and `url` is the docs anchor for this register.
        case "nclusterid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nclusterid\" target=\"_blank\" rel=\"noopener noreferrer\">nclusterid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %nclusterid</h1><section id=\"special-registers-nclusterid\">\n\n\n<p>Number of cluster identifiers per grid.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %nclusterid;\n.sreg .u32 %nclusterid.x, %nclusterid.y, %nclusterid.z;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the number of clusters in each grid\ndimension.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">%nclusterid</span></code> special register contains a 3D grid shape vector that holds the grid dimensions\nin terms of clusters. The fourth element is unused and always returns zero.</p>\n<p>Refer to the <em>Cuda Programming Guide</em> for details on the maximum values of <code class=\"docutils literal notranslate\"><span class=\"pre\">%nclusterid.{x,y,z}</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.reg .b32 %r&lt;2&gt;;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %nclusterid.x;\nmov.u32     %r1, %nclusterid.z;\nmov.v4.u32  %rx, %nclusterid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of cluster identifiers per grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %nclusterid;\n\n.sreg .u32 %nclusterid.x, %nclusterid.y, %nclusterid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the number of clusters in each grid\n\ndimension.\n\nThe %nclusterid special register contains a 3D grid shape vector that holds the grid dimensions\n\nin terms of clusters. The fourth element is unused and always returns zero.\n\nRefer to the Cuda Programming Guide for details on the maximum values of %nclusterid.{x,y,z}.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %nclusterid.x;\n\nmov.u32     %r1, %nclusterid.z;\n\nmov.v4.u32  %rx, %nclusterid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nclusterid"
            };

        // PTX `%nctaid` special register: number of CTAs in each grid dimension.
        // Fields: `html` is the marked-up documentation section preceded by a link to the NVIDIA docs,
        // `tooltip` is the same text flattened to plain text, and `url` is the docs anchor for this register.
        // NOTE: plain text cannot carry the <sup> used in `html`, so the tooltip writes the sm_3x+ limit of
        // %nctaid.x as "2^31 -1"; dropping the superscript would read as the wrong value "231 -1".
        case "nctaid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nctaid\" target=\"_blank\" rel=\"noopener noreferrer\">nctaid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %nctaid</h1><section id=\"special-registers-nctaid\">\n\n\n<p>Number of CTA ids per grid.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %nctaid                      // Grid shape vector\n.sreg .u32 %nctaid.x,%nctaid.y,%nctaid.z;   // Grid dimensions\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the number of CTAs in each grid\ndimension. The <code class=\"docutils literal notranslate\"><span class=\"pre\">%nctaid</span></code> special register contains a 3D grid shape vector, with each element\nhaving a value of at least <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code>. 
The fourth element is unused and always returns zero.</p>\n<p>Maximum values of %nctaid.{x,y,z} are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 54%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 13%\"/>\n<col style=\"width: 13%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>.target architecture</p></th>\n<th class=\"head\"><p>%nctaid.x</p></th>\n<th class=\"head\"><p>%nctaid.y</p></th>\n<th class=\"head\"><p>%nctaid.z</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code></p></td>\n<td><p>65535</p></td>\n<td><p>65535</p></td>\n<td><p>65535</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_3x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_5x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_7x</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_9x</span></code></p></td>\n<td><p>2<sup>31</sup> -1</p></td>\n<td><p>65535</p></td>\n<td><p>65535</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0 with type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u16</span></code>.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u32</span></code> in PTX ISA version 2.0. 
For compatibility with legacy PTX code, 16-bit\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instructions may be used to read the lower 16-bits of each component of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%nctaid</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32  %r0,%nctaid.x;\nmov.u16  %rh,%nctaid.x;     // legacy code\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of CTA ids per grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %nctaid                      // Grid shape vector\n\n.sreg .u32 %nctaid.x,%nctaid.y,%nctaid.z;   // Grid dimensions\n\nDescription\n\nA predefined, read-only special register initialized with the number of CTAs in each grid\n\ndimension. The %nctaid special register contains a 3D grid shape vector, with each element\n\nhaving a value of at least 1. The fourth element is unused and always returns zero.\n\nMaximum values of %nctaid.{x,y,z} are as follows:\n\n\n\n\n\n.target architecture\n\n%nctaid.x\n\n%nctaid.y\n\n%nctaid.z\n\n\n\nsm_1x, sm_20\n\n65535\n\n65535\n\n65535\n\nsm_3x, sm_5x, sm_6x, sm_7x,\n\nsm_8x, sm_9x\n\n2^31 -1\n\n65535\n\n65535\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%nctaid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32  %r0,%nctaid.x;\n\nmov.u16  %rh,%nctaid.x;     // legacy code\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nctaid"
            };

        case "neg":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg\" target=\"_blank\" rel=\"noopener noreferrer\">neg(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg\" target=\"_blank\" rel=\"noopener noreferrer\">neg(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg\" target=\"_blank\" rel=\"noopener noreferrer\">neg(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: neg</h1><section id=\"floating-point-instructions-neg\">\n\n\n<p>Arithmetic negate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>neg{.ftz}.f32  d, a;\nneg.f64        d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Negate the sign of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and store the result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = -a;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt>\n<dd>\n<p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving 
zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt>\n<dd>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> inputs yield an unspecified <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>. Future implementations may comply with the IEEE 754\nstandard by preserving payload and modifying only the sign bit.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>neg.ftz.f32  x,f0;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: neg</h1><section id=\"half-precision-floating-point-instructions-neg\">\n\n\n<p>Arithmetic negate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>neg{.ftz}.f16    d, a;\nneg{.ftz}.f16x2  d, a;\nneg.bf16         d, a;\nneg.bf16x2       d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Negate the sign of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and store the result in <code class=\"docutils literal notranslate\"><span 
class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vector by extracting half word values\nfrom the source operand. Half-word operands are then negated in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> result in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction\ntype, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>\nand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = -a;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = -fA[i];\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt>\n<dd>\n<p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">neg.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> inputs yield an unspecified <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>. 
Future implementations may comply with the IEEE 754\nstandard by preserving payload and modifying only the sign bit.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">neg.bf16x2</span></code> introduced in PTX ISA 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">neg.bf16x2</span></code> requires architecture <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>neg.ftz.f16  x,f0;\nneg.bf16     x,b0;\nneg.bf16x2   x1,b1;\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: neg</h1><section id=\"integer-arithmetic-instructions-neg\">\n\n\n<p>Arithmetic negate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>neg.type  d, a;\n\n.type = { .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Negate the sign of <strong>a</strong> and store the result in <strong>d</strong>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = -a;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Only for signed integers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text 
notranslate\">\n<div class=\"highlight\"><pre><span></span>neg.s32  r0,a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: neg\n\n\n\nArithmetic negate.\n\nSyntax\n\nneg{.ftz}.f32  d, a;\n\nneg.f64        d, a;\n\nDescription\n\nNegate the sign of a and store the result in d.\n\nSemantics\n\nd = -a;\n\nNotes\n\nSubnormal numbers:\n\nsm_20+\n\nBy default, subnormal numbers are supported.\n\nneg.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1x\n\nneg.f64 supports subnormal numbers.\n\nneg.f32 flushes subnormal inputs and results to sign-preservi...\n\n=====Half Precision Floating Point Instructions: neg\n\n\n\nArithmetic negate.\n\nSyntax\n\nneg{.ftz}.f16    d, a;\n\nneg{.ftz}.f16x2  d, a;\n\nneg.bf16         d, a;\n\nneg.bf16x2       d, a;\n\nDescription\n\nNegate the sign of a and store the result in d.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vector by extracting half word values\n\nfrom the source operand. Half-word operands are then negated in parallel to produce .f16x2 or\n\n.bf16x2 result in destination.\n\nFor .f...\n\n=====Integer Arithmetic Instructions: neg\n\n\n\nArithmetic negate.\n\nSyntax\n\nneg.type  d, a;\n\n.type = { .s16, .s32, .s64 };\n\nDescription\n\nNegate the sign of a and store the result in d.\n\nSemantics\n\nd = -a;\n\nNotes\n\nOnly for signed integers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nneg.s32  r0,a;\n\n... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg"
            };

        case "noreturn":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-noreturn\" target=\"_blank\" rel=\"noopener noreferrer\">noreturn <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Performance-Tuning Directives: .noreturn</h1><section id=\"performance-tuning-directives-noreturn\">\n\n\n<p>Indicate that the function does not return to its caller function.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.noreturn\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Indicate that the function does not return to its caller function.</p>\n<p><strong>Semantics</strong></p>\n<p>An optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive indicates that the function does not return to caller\nfunction. <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive can only be specified on device functions and must appear between\na <code class=\"docutils literal notranslate\"><span class=\"pre\">.func</span></code> directive and its body.</p>\n<p>The directive cannot be specified on functions which have return parameters.</p>\n<p>If a function with <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive returns to the caller function at runtime, then the\nbehavior is undefined.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.func foo .noreturn { ... }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Indicate that the function does not return to its caller function.\n\nSyntax\n\n.noreturn\n\nDescription\n\nIndicate that the function does not return to its caller function.\n\nSemantics\n\nAn optional .noreturn directive indicates that the function does not return to caller\n\nfunction. .noreturn directive can only be specified on device functions and must appear between\n\na .func directive and its body.\n\nThe directive cannot be specified on functions which have return parameters.\n\nIf a function with .noreturn directive returns to the caller function at runtime, then the\n\nbehavior is undefined.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.4.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\n.func foo .noreturn { ... }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-noreturn"
            };

        case "not":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not\" target=\"_blank\" rel=\"noopener noreferrer\">not <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: not</h1><section id=\"logic-and-shift-instructions-not\">\n\n\n<p>Bitwise negation; one\u2019s complement.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>not.type d, a;\n\n.type = { .pred, .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Invert the bits in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = ~a;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The size of the operands must match, but not necessarily the type.</p>\n<p>Allowed types include predicates.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>not.b32  mask,mask;\nnot.pred  p,q;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bitwise negation; one\u2019s complement.\n\nSyntax\n\nnot.type d, a;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nInvert the bits in a.\n\nSemantics\n\nd = ~a;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicates.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nnot.b32  mask,mask;\n\nnot.pred  p,q;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not"
            };

        case "nsmid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nsmid\" target=\"_blank\" rel=\"noopener noreferrer\">nsmid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %nsmid</h1><section id=\"special-registers-nsmid\">\n\n\n<p>Number of SM identifiers.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %nsmid;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register that returns the maximum number of SM identifiers. The SM\nidentifier numbering is not guaranteed to be contiguous, so <code class=\"docutils literal notranslate\"><span class=\"pre\">%nsmid</span></code> may be larger than the\nphysical number of SMs in the device.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%nsmid</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32  %r, %nsmid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of SM identifiers.\n\nSyntax (predefined)\n\n.sreg .u32 %nsmid;\n\nDescription\n\nA predefined, read-only special register that returns the maximum number of SM identifiers. The SM\n\nidentifier numbering is not guaranteed to be contiguous, so %nsmid may be larger than the\n\nphysical number of SMs in the device.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%nsmid requires sm_20 or higher.\n\nExamples\n\nmov.u32  %r, %nsmid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nsmid"
            };

        case "ntid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-ntid\" target=\"_blank\" rel=\"noopener noreferrer\">ntid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %ntid</h1><section id=\"special-registers-ntid\">\n\n\n<p>Number of thread IDs per CTA.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %ntid;                   // CTA shape vector\n.sreg .u32 %ntid.x, %ntid.y, %ntid.z;   // CTA dimensions\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the number of thread ids in each CTA\ndimension. The <code class=\"docutils literal notranslate\"><span class=\"pre\">%ntid</span></code> special register contains a 3D CTA shape vector that holds the CTA\ndimensions. CTA dimensions are non-zero; the fourth element is unused and always returns zero. 
The\ntotal number of threads in a CTA is <code class=\"docutils literal notranslate\"><span class=\"pre\">(%ntid.x</span> <span class=\"pre\">*</span> <span class=\"pre\">%ntid.y</span> <span class=\"pre\">*</span> <span class=\"pre\">%ntid.z)</span></code>.</p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>%ntid.y == %ntid.z == 1 in 1D CTAs.\n%ntid.z ==1 in 2D CTAs.\n</pre></div>\n</div>\n<p>Maximum values of %ntid.{x,y,z} are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 12%\"/>\n<col style=\"width: 12%\"/>\n<col style=\"width: 12%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\">\n<th class=\"head\"><p>.target architecture</p></th>\n<th class=\"head\"><p>%ntid.x</p></th>\n<th class=\"head\"><p>%ntid.y</p></th>\n<th class=\"head\"><p>%ntid.z</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></p></td>\n<td><p>512</p></td>\n<td><p>512</p></td>\n<td><p>64</p></td>\n</tr>\n<tr class=\"row-odd\">\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_3x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_5x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_7x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_9x</span></code></p></td>\n<td><p>1024</p></td>\n<td><p>1024</p></td>\n<td><p>64</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0 with type <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.v4.u16</span></code>.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u32</span></code> in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instructions may be used to read the lower 16-bits of each component of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%ntid</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>// compute unified thread id for 2D CTA\nmov.u32  %r0,%tid.x;\nmov.u32  %h1,%tid.y;\nmov.u32  %h2,%ntid.x;\nmad.u32  %r0,%h1,%h2,%r0;\n\nmov.u16  %rh,%ntid.x;      // legacy code\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of thread IDs per CTA.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %ntid;                   // CTA shape vector\n\n.sreg .u32 %ntid.x, %ntid.y, %ntid.z;   // CTA dimensions\n\nDescription\n\nA predefined, read-only special register initialized with the number of thread ids in each CTA\n\ndimension. The %ntid special register contains a 3D CTA shape vector that holds the CTA\n\ndimensions. CTA dimensions are non-zero; the fourth element is unused and always returns zero. The\n\ntotal number of threads in a CTA is (%ntid.x * %ntid.y * %ntid.z).\n\n%ntid.y == %ntid.z == 1 in 1D CTAs.\n\n%ntid.z ==1 in 2D CTAs.\n\nMaximum values of %ntid.{x,y,z} are as follows:\n\n\n\n\n\n.target architecture\n\n%ntid.x\n\n%ntid.y\n\n%ntid.z\n\n\n\nsm_1x\n\n512\n\n512\n\n64\n\nsm_20, sm_3x, sm_5x, sm_6x,\n\nsm_7x, sm_8x, sm_9x\n\n1024\n\n1024\n\n64\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%ntid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n// compute unified thread id for 2D CTA\n\nmov.u32  %r0,%tid.x;\n\nmov.u32  %h1,%tid.y;\n\nmov.u32  %h2,%ntid.x;\n\nmad.u32  %r0,%h1,%h2,%r0;\n\nmov.u16  %rh,%ntid.x;      // legacy code\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-ntid"
            };

        case "nwarpid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nwarpid\" target=\"_blank\" rel=\"noopener noreferrer\">nwarpid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %nwarpid</h1><section id=\"special-registers-nwarpid\">\n\n\n<p>Number of warp identifiers.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %nwarpid;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register that returns the maximum number of warp identifiers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%nwarpid</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32  %r, %nwarpid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of warp identifiers.\n\nSyntax (predefined)\n\n.sreg .u32 %nwarpid;\n\nDescription\n\nA predefined, read-only special register that returns the maximum number of warp identifiers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%nwarpid requires sm_20 or higher.\n\nExamples\n\nmov.u32  %r, %nwarpid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nwarpid"
            };

        case "or":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or\" target=\"_blank\" rel=\"noopener noreferrer\">or <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: or</h1><section id=\"logic-and-shift-instructions-or\">\n\n\n<p>Bitwise OR.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>or.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the bit-wise or operation for the bits in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>d = a | b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The size of the operands must match, but not necessarily the type.</p>\n<p>Allowed types include predicate registers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>or.b32  mask mask,0x00010001\nor.pred  p,q,r;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bitwise OR.\n\nSyntax\n\nor.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nCompute the bit-wise or operation for the bits in a and b.\n\nSemantics\n\nd = a | b;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicate registers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nor.b32  mask mask,0x00010001\n\nor.pred  p,q,r;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or"
            };

        case "pm0":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-pm0-pm7\" target=\"_blank\" rel=\"noopener noreferrer\">pm0..%pm7 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %pm0..%pm7</h1><section id=\"special-registers-pm0-pm7\">\n\n\n<p>Performance monitoring counters.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u32 %pm&lt;8&gt;;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special registers <code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0..%pm7</span></code> are unsigned 32-bit read-only performance monitor counters. Their\nbehavior is currently undefined.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0..%pm3</span></code> introduced in PTX ISA version 1.3.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm4..%pm7</span></code> introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0..%pm3</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm4..%pm7</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32  r1,%pm0;\nmov.u32  r1,%pm7;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Performance monitoring counters.\n\nSyntax (predefined)\n\n.sreg .u32 %pm<8>;\n\nDescription\n\nSpecial registers %pm0..%pm7 are unsigned 32-bit read-only performance monitor counters. Their\n\nbehavior is currently undefined.\n\nPTX ISA Notes\n\n%pm0..%pm3 introduced in PTX ISA version 1.3.\n\n%pm4..%pm7 introduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\n%pm0..%pm3 supported on all target architectures.\n\n%pm4..%pm7 require sm_20 or higher.\n\nExamples\n\nmov.u32  r1,%pm0;\n\nmov.u32  r1,%pm7;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-pm0-pm7"
            };

        case "pm0_64":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-pm0-64-pm7-64\" target=\"_blank\" rel=\"noopener noreferrer\">pm0_64..%pm7_64 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %pm0_64..%pm7_64</h1><section id=\"special-registers-pm0-64-pm7-64\">\n<span id=\"id16\"></span>\n\n<p>64 bit Performance monitoring counters.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.sreg .u64 %pm0_64;\n.sreg .u64 %pm1_64;\n.sreg .u64 %pm2_64;\n.sreg .u64 %pm3_64;\n.sreg .u64 %pm4_64;\n.sreg .u64 %pm5_64;\n.sreg .u64 %pm6_64;\n.sreg .u64 %pm7_64;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special registers <code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0_64..%pm7_64</span></code> are unsigned 64-bit read-only performance monitor\ncounters. Their behavior is currently undefined.</p>\n<p><strong>Notes</strong></p>\n<p>The lower 32bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0_64..%pm7_64</span></code> are identical to <code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0..%pm7</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0_64..%pm7_64</span></code> introduced in PTX ISA version 4.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0_64..%pm7_64</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>mov.u32  r1,%pm0_64;\nmov.u32  r1,%pm7_64;\n</pre></div>\n</div>\n</section>",
                "tooltip": "64 bit Performance monitoring counters.\n\nSyntax (predefined)\n\n.sreg .u64 %pm0_64;\n\n.sreg .u64 %pm1_64;\n\n.sreg .u64 %pm2_64;\n\n.sreg .u64 %pm3_64;\n\n.sreg .u64 %pm4_64;\n\n.sreg .u64 %pm5_64;\n\n.sreg .u64 %pm6_64;\n\n.sreg .u64 %pm7_64;\n\nDescription\n\nSpecial registers %pm0_64..%pm7_64 are unsigned 64-bit read-only performance monitor\n\ncounters. Their behavior is currently undefined.\n\nNotes\n\nThe lower 32bits of %pm0_64..%pm7_64 are identical to %pm0..%pm7.\n\nPTX ISA Notes\n\n%pm0_64..%pm7_64 introduced in PTX ISA version 4.0.\n\nTarget ISA Notes\n\n%pm0_64..%pm7_64 require sm_50 or higher.\n\nExamples\n\nmov.u32  r1,%pm0_64;\n\nmov.u32  r1,%pm7_64;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-pm0-64-pm7-64"
            };

        case "pmevent":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent\" target=\"_blank\" rel=\"noopener noreferrer\">pmevent <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Miscellaneous Instructions: pmevent</h1><section id=\"miscellaneous-instructions-pmevent\">\n\n\n<p>Trigger one or more Performance Monitor events.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>pmevent       a;    // trigger a single performance monitor event\npmevent.mask  a;    // trigger one or more performance monitor events\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Triggers one or more of a fixed number of performance monitor events, with event index or mask\nspecified by immediate operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">pmevent</span></code> (without modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.mask</span></code>) triggers a single performance monitor event indexed by\nimmediate operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, in the range <code class=\"docutils literal notranslate\"><span class=\"pre\">0..15</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">pmevent.mask</span></code> triggers one or more of the performance monitor events. Each bit in the 16-bit\nimmediate operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> controls an event.</p>\n<p>Programmatic performance moniter events may be combined with other hardware events using Boolean\nfunctions to increment one of the four performance counters. 
The relationship between events and\ncounters is programmed via API calls from the host.</p>\n<p><strong>Notes</strong></p>\n<p>Currently, there are sixteen performance monitor events, numbered 0 through 15.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">pmevent</span></code> introduced in PTX ISA version 1.4.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">pmevent.mask</span></code> introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>pmevent supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">pmevent.mask</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>    pmevent      1;\n@p  pmevent      7;\n@q  pmevent.mask 0xff;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Trigger one or more Performance Monitor events.\n\nSyntax\n\npmevent       a;    // trigger a single performance monitor event\n\npmevent.mask  a;    // trigger one or more performance monitor events\n\nDescription\n\nTriggers one or more of a fixed number of performance monitor events, with event index or mask\n\nspecified by immediate operand a.\n\npmevent (without modifier .mask) triggers a single performance monitor event indexed by\n\nimmediate operand a, in the range 0..15.\n\npmevent.mask triggers one or more of the performance monitor events. Each bit in the 16-bit\n\nimmediate operand a controls an event.\n\nProgrammatic performance moniter events may be combined with other hardware events using Boolean\n\nfunctions to increment one of the four performance counters. The relationship between events and\n\ncounters is programmed via API calls from the host.\n\nNotes\n\nCurrently, there are sixteen performance monitor events, numbered 0 through 15.\n\nPTX ISA Notes\n\npmevent introduced in PTX ISA version 1.4.\n\npmevent.mask introduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\npmevent supported on all target architectures.\n\npmevent.mask requires sm_20 or higher.\n\nExamples\n\n    pmevent      1;\n\n@p  pmevent      7;\n\n@q  pmevent.mask 0xff;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent"
            };

        case "popc":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc\" target=\"_blank\" rel=\"noopener noreferrer\">popc(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: popc</h1><section id=\"integer-arithmetic-instructions-popc\">\n\n\n<p>Population count.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>popc.type  d, a;\n\n.type = { .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Count the number of one bits in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and place the resulting <em>population count</em> in 32-bit\ndestination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has the instruction type and destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has type\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.u32  d = 0;\nwhile (a != 0) {\n   if (a &amp; 0x1)  d++;\n   a = a &gt;&gt; 1;\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">popc</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>popc.b32  d, a;\npopc.b64  cnt, X;  // cnt is 
.u32\n</pre></div>\n</div>\n</section>",
                "tooltip": "Population count.\n\nSyntax\n\npopc.type  d, a;\n\n.type = { .b32, .b64 };\n\nDescription\n\nCount the number of one bits in a and place the resulting population count in 32-bit\n\ndestination register d. Operand a has the instruction type and destination d has type\n\n.u32.\n\nSemantics\n\n.u32  d = 0;\n\nwhile (a != 0) {\n\n   if (a & 0x1)  d++;\n\n   a = a >> 1;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\npopc requires sm_20 or higher.\n\nExamples\n\npopc.b32  d, a;\n\npopc.b64  cnt, X;  // cnt is .u32\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc"
            };

        case "pragma":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-pragma\" target=\"_blank\" rel=\"noopener noreferrer\">pragma <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Performance-Tuning Directives: .pragma</h1><section id=\"performance-tuning-directives-pragma\">\n\n\n<p>Pass directives to PTX backend compiler.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.pragma list-of-strings ;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Pass module-scoped, entry-scoped, or statement-level directives to the PTX backend compiler.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.pragma</span></code> directive may occur at module-scope, at entry-scope, or at statement-level.</p>\n<p><strong>Semantics</strong></p>\n<p>The interpretation of <code class=\"docutils literal notranslate\"><span class=\"pre\">.pragma</span></code> directive strings is implementation-specific and has no impact on\nPTX semantics. See <a class=\"reference external\" href=\"#descriptions-of-pragma-strings\">Descriptions of .pragma Strings</a> for\ndescriptions of the pragma strings defined in <code class=\"docutils literal notranslate\"><span class=\"pre\">ptxas</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>.pragma \"nounroll\";    // disable unrolling in backend\n\n// disable unrolling for current kernel\n.entry foo .pragma \"nounroll\"; { ... }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Pass directives to PTX backend compiler.\n\nSyntax\n\n.pragma list-of-strings ;\n\nDescription\n\nPass module-scoped, entry-scoped, or statement-level directives to the PTX backend compiler.\n\nThe .pragma directive may occur at module-scope, at entry-scope, or at statement-level.\n\nSemantics\n\nThe interpretation of .pragma directive strings is implementation-specific and has no impact on\n\nPTX semantics. See Descriptions of .pragma Strings for\n\ndescriptions of the pragma strings defined in ptxas.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.pragma \"nounroll\";    // disable unrolling in backend\n\n// disable unrolling for current kernel\n\n.entry foo .pragma \"nounroll\"; { ... }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-pragma"
            };

        case "prefetch":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu\" target=\"_blank\" rel=\"noopener noreferrer\">prefetch <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: prefetch, prefetchu</h1><section id=\"data-movement-and-conversion-instructions-prefetch-prefetchu\">\n\n\n<p>Prefetch line containing a generic address at a specified level of memory hierarchy, in specified\nstate space.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\">\n<div class=\"highlight\"><pre><span></span>prefetch{.space}.level                    [a];   // prefetch to data cache\nprefetch.global.level::eviction_priority  [a];   // prefetch to data cache\n\nprefetchu.L1  [a];             // prefetch to uniform cache\n\nprefetch{.tensormap_space}.tensormap [a];  // prefetch the tensormap\n\n.space =                    { .global, .local };\n.level =                    { .L1, .L2 };\n.level::eviction_priority = { .L2::evict_last, .L2::evict_normal };\n.tensormap_space =          { .const, .param };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> instruction brings the cache line containing the specified address in global or\nlocal memory state space into the specified cache level.</p>\n<p>If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.tensormap</span></code> qualifier is specified then the <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> instruction brings the cache line\ncontaining the specified address in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code> or <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.param</span></code> memory state space for subsequent\nuse by the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> instruction.</p>\n<p>If no state space is given, the <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> uses <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>.</p>\n<p>Optionally, the eviction priority to be applied on the prefetched cache line can be specified by the\nmodifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code>.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetchu</span></code> instruction brings the cache line containing the specified generic address into\nthe specified uniform cache level.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> to a shared memory location performs no operation.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> into the uniform cache requires a generic address, and no operation occurs if the\naddress maps to a <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <code class=\"docutils literal 