AirFixPartialRegisterStalls.cpp   [plain text]


/*
 * Copyright (C) 2015-2017 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "AirFixPartialRegisterStalls.h"

#if ENABLE(B3_JIT)

#include "AirBasicBlock.h"
#include "AirCode.h"
#include "AirInsertionSet.h"
#include "AirInst.h"
#include "AirInstInlines.h"
#include "AirPhaseScope.h"
#include "MacroAssembler.h"
#include <wtf/IndexMap.h>
#include <wtf/IndexSet.h>
#include <wtf/Vector.h>

namespace JSC { namespace B3 { namespace Air {

namespace {

bool hasPartialXmmRegUpdate(const Inst& inst)
{
    switch (inst.kind.opcode) {
    case ConvertDoubleToFloat:
    case ConvertFloatToDouble:
    case ConvertInt32ToDouble:
    case ConvertInt64ToDouble:
    case ConvertInt32ToFloat:
    case ConvertInt64ToFloat:
    case SqrtDouble:
    case SqrtFloat:
    case CeilDouble:
    case CeilFloat:
    case FloorDouble:
    case FloorFloat:
        return true;
    default:
        break;
    }
    return false;
}

bool isDependencyBreaking(const Inst& inst)
{
    // "xorps reg, reg" is used by the frontend to remove the dependency on its argument.
    return inst.kind.opcode == MoveZeroToDouble;
}

// FIXME: find a good distance per architecture experimentally.
// LLVM uses a distance of 16 but that comes from Nehalem.
unsigned char minimumSafeDistance = 16;

struct FPDefDistance {
    FPDefDistance()
    {
        for (unsigned i = 0; i < MacroAssembler::numberOfFPRegisters(); ++i)
            distance[i] = 255;
    }

    void reset(FPRReg reg)
    {
        unsigned index = MacroAssembler::fpRegisterIndex(reg);
        distance[index] = 255;
    }

    void add(FPRReg reg, unsigned registerDistance)
    {
        unsigned index = MacroAssembler::fpRegisterIndex(reg);
        if (registerDistance < distance[index])
            distance[index] = static_cast<unsigned char>(registerDistance);
    }

    bool updateFromPrecessor(FPDefDistance& precessorDistance, unsigned constantOffset = 0)
    {
        bool changed = false;
        for (unsigned i = 0; i < MacroAssembler::numberOfFPRegisters(); ++i) {
            unsigned regDistance = precessorDistance.distance[i] + constantOffset;
            if (regDistance < minimumSafeDistance && regDistance < distance[i]) {
                distance[i] = regDistance;
                changed = true;
            }
        }
        return changed;
    }

    unsigned char distance[MacroAssembler::numberOfFPRegisters()];
};

void updateDistances(Inst& inst, FPDefDistance& localDistance, unsigned& distanceToBlockEnd)
{
    --distanceToBlockEnd;

    if (isDependencyBreaking(inst)) {
        localDistance.reset(inst.args[0].tmp().fpr());
        return;
    }

    inst.forEachTmp([&] (Tmp& tmp, Arg::Role role, Bank, Width) {
        ASSERT_WITH_MESSAGE(tmp.isReg(), "This phase must be run after register allocation.");

        if (tmp.isFPR() && Arg::isAnyDef(role))
            localDistance.add(tmp.fpr(), distanceToBlockEnd);
    });
}

}

void fixPartialRegisterStalls(Code& code)
{
    if (!isX86())
        return;

    PhaseScope phaseScope(code, "fixPartialRegisterStalls");

    Vector<BasicBlock*> candidates;

    for (BasicBlock* block : code) {
        for (const Inst& inst : *block) {
            if (hasPartialXmmRegUpdate(inst)) {
                candidates.append(block);
                break;
            }
        }
    }

    // Fortunately, Partial Stalls are rarely used. Return early if no block
    // cares about them.
    if (candidates.isEmpty())
        return;

    // For each block, this provides the distance to the last instruction setting each register
    // on block *entry*.
    IndexMap<BasicBlock*, FPDefDistance> lastDefDistance(code.size());

    // Blocks with dirty distance at head.
    IndexSet<BasicBlock*> dirty;

    // First, we compute the local distance for each block and push it to the successors.
    for (BasicBlock* block : code) {
        FPDefDistance localDistance;

        unsigned distanceToBlockEnd = block->size();
        for (Inst& inst : *block)
            updateDistances(inst, localDistance, distanceToBlockEnd);

        for (BasicBlock* successor : block->successorBlocks()) {
            if (lastDefDistance[successor].updateFromPrecessor(localDistance))
                dirty.add(successor);
        }
    }

    // Now we propagate the minimums accross blocks.
    bool changed;
    do {
        changed = false;

        for (BasicBlock* block : code) {
            if (!dirty.remove(block))
                continue;

            // Little shortcut: if the block is big enough, propagating it won't add any information.
            if (block->size() >= minimumSafeDistance)
                continue;

            unsigned blockSize = block->size();
            FPDefDistance& blockDistance = lastDefDistance[block];
            for (BasicBlock* successor : block->successorBlocks()) {
                if (lastDefDistance[successor].updateFromPrecessor(blockDistance, blockSize)) {
                    dirty.add(successor);
                    changed = true;
                }
            }
        }
    } while (changed);

    // Finally, update each block as needed.
    InsertionSet insertionSet(code);
    for (BasicBlock* block : candidates) {
        unsigned distanceToBlockEnd = block->size();
        FPDefDistance& localDistance = lastDefDistance[block];

        for (unsigned i = 0; i < block->size(); ++i) {
            Inst& inst = block->at(i);

            if (hasPartialXmmRegUpdate(inst)) {
                RegisterSet defs;
                RegisterSet uses;
                inst.forEachTmp([&] (Tmp& tmp, Arg::Role role, Bank, Width) {
                    if (tmp.isFPR()) {
                        if (Arg::isAnyDef(role))
                            defs.set(tmp.fpr());
                        if (Arg::isAnyUse(role))
                            uses.set(tmp.fpr());
                    }
                });
                // We only care about values we define but not use. Otherwise we have to wait
                // for the value to be resolved anyway.
                defs.exclude(uses);

                defs.forEach([&] (Reg reg) {
                    if (localDistance.distance[MacroAssembler::fpRegisterIndex(reg.fpr())] < minimumSafeDistance)
                        insertionSet.insert(i, MoveZeroToDouble, inst.origin, Tmp(reg));
                });
            }

            updateDistances(inst, localDistance, distanceToBlockEnd);
        }
        insertionSet.execute(block);
    }
}

} } } // namespace JSC::B3::Air

#endif // ENABLE(B3_JIT)