RegExp.cpp   [plain text]


/*
 *  Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
 *  Copyright (c) 2007, 2008 Apple Inc. All rights reserved.
 *  Copyright (C) 2009 Torch Mobile, Inc.
 *  Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

#include "config.h"
#include "RegExp.h"

#include "Lexer.h"
#include "RegExpCache.h"
#include "yarr/Yarr.h"
#include "yarr/YarrJIT.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wtf/Assertions.h>
#include <wtf/OwnArrayPtr.h>

namespace JSC {

const ClassInfo RegExp::s_info = { "RegExp", 0, 0, 0 };

RegExpFlags regExpFlags(const UString& string)
{
    RegExpFlags flags = NoFlags;

    for (unsigned i = 0; i < string.length(); ++i) {
        switch (string.characters()[i]) {
        case 'g':
            if (flags & FlagGlobal)
                return InvalidFlags;
            flags = static_cast<RegExpFlags>(flags | FlagGlobal);
            break;

        case 'i':
            if (flags & FlagIgnoreCase)
                return InvalidFlags;
            flags = static_cast<RegExpFlags>(flags | FlagIgnoreCase);
            break;

        case 'm':
            if (flags & FlagMultiline)
                return InvalidFlags;
            flags = static_cast<RegExpFlags>(flags | FlagMultiline);
            break;

        default:
            return InvalidFlags;
        }
    }

    return flags;
}
  
struct RegExpRepresentation {
#if ENABLE(YARR_JIT)
    Yarr::YarrCodeBlock m_regExpJITCode;
#endif
    OwnPtr<Yarr::BytecodePattern> m_regExpBytecode;
};

RegExp::RegExp(JSGlobalData* globalData, const UString& patternString, RegExpFlags flags)
    : JSCell(*globalData, globalData->regExpStructure.get())
    , m_state(NotCompiled)
    , m_patternString(patternString)
    , m_flags(flags)
    , m_constructionError(0)
    , m_numSubpatterns(0)
#if ENABLE(REGEXP_TRACING)
    , m_rtMatchCallCount(0)
    , m_rtMatchFoundCount(0)
#endif
{
    Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
    if (m_constructionError)
        m_state = ParseError;
    else
        m_numSubpatterns = pattern.m_numSubpatterns;
}

RegExp::~RegExp()
{
}

RegExp* RegExp::create(JSGlobalData* globalData, const UString& patternString, RegExpFlags flags)
{
    return globalData->regExpCache()->lookupOrCreate(patternString, flags);
}

void RegExp::compile(JSGlobalData* globalData)
{
    ASSERT(m_state == NotCompiled);
    m_representation = adoptPtr(new RegExpRepresentation);
    m_state = Compiling;
    Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
    if (m_constructionError) {
        ASSERT_NOT_REACHED();
        m_state = ParseError;
        return;
    }

    globalData->regExpCache()->addToStrongCache(this);

    ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);

#if ENABLE(YARR_JIT)
    if (!pattern.m_containsBackreferences && globalData->canUseJIT()) {
        Yarr::jitCompile(pattern, globalData, m_representation->m_regExpJITCode);
#if ENABLE(YARR_JIT_DEBUG)
        if (!m_representation->m_regExpJITCode.isFallBack())
            m_state = JITCode;
        else
            m_state = ByteCode;
#else
        if (!m_representation->m_regExpJITCode.isFallBack()) {
            m_state = JITCode;
            return;
        }
#endif
    }
#endif

    m_representation->m_regExpBytecode = Yarr::byteCompile(pattern, &globalData->m_regExpAllocator);

    m_state = ByteCode;
}

int RegExp::match(JSGlobalData& globalData, const UString& s, int startOffset, Vector<int, 32>* ovector)
{
    if (startOffset < 0)
        startOffset = 0;

#if ENABLE(REGEXP_TRACING)
    m_rtMatchCallCount++;
#endif

    if (static_cast<unsigned>(startOffset) > s.length() || s.isNull())
        return -1;

    if (m_state != ParseError) {
        compileIfNecessary(globalData);

        int offsetVectorSize = (m_numSubpatterns + 1) * 2;
        int* offsetVector;
        Vector<int, 32> nonReturnedOvector;
        if (ovector) {
            ovector->resize(offsetVectorSize);
            offsetVector = ovector->data();
        } else {
            nonReturnedOvector.resize(offsetVectorSize);
            offsetVector = nonReturnedOvector.data();
        }

        ASSERT(offsetVector);
        // Initialize offsetVector with the return value (index 0) and the 
        // first subpattern start indicies (even index values) set to -1.
        // No need to init the subpattern end indicies.
        for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++)            
            offsetVector[j] = -1;

        int result;
#if ENABLE(YARR_JIT)
        if (m_state == JITCode) {
            result = Yarr::execute(m_representation->m_regExpJITCode, s.characters(), startOffset, s.length(), offsetVector);
#if ENABLE(YARR_JIT_DEBUG)
            matchCompareWithInterpreter(s, startOffset, offsetVector, result);
#endif
        } else
#endif
            result = Yarr::interpret(m_representation->m_regExpBytecode.get(), s.characters(), startOffset, s.length(), offsetVector);
        ASSERT(result >= -1);

#if ENABLE(REGEXP_TRACING)
        if (result != -1)
            m_rtMatchFoundCount++;
#endif

        return result;
    }

    return -1;
}

void RegExp::invalidateCode()
{
    if (!m_representation || m_state == Compiling)
        return;
    m_state = NotCompiled;
    m_representation.clear();
}

#if ENABLE(YARR_JIT_DEBUG)
void RegExp::matchCompareWithInterpreter(const UString& s, int startOffset, int* offsetVector, int jitResult)
{
    int offsetVectorSize = (m_numSubpatterns + 1) * 2;
    Vector<int, 32> interpreterOvector;
    interpreterOvector.resize(offsetVectorSize);
    int* interpreterOffsetVector = interpreterOvector.data();
    int interpreterResult = 0;
    int differences = 0;

    // Initialize interpreterOffsetVector with the return value (index 0) and the 
    // first subpattern start indicies (even index values) set to -1.
    // No need to init the subpattern end indicies.
    for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++)
        interpreterOffsetVector[j] = -1;

    interpreterResult = Yarr::interpret(m_representation->m_regExpBytecode.get(), s.characters(), startOffset, s.length(), interpreterOffsetVector);

    if (jitResult != interpreterResult)
        differences++;

    for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++)
        if ((offsetVector[j] != interpreterOffsetVector[j])
            || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])))
            differences++;

    if (differences) {
        fprintf(stderr, "RegExp Discrepency for /%s/\n    string input ", pattern().utf8().data());
        unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset);

        fprintf(stderr, (segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset);

        if (jitResult != interpreterResult) {
            fprintf(stderr, "    JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult);
            differences--;
        } else {
            fprintf(stderr, "    Correct result = %d\n", jitResult);
        }

        if (differences) {
            for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) {
                if (offsetVector[j] != interpreterOffsetVector[j])
                    fprintf(stderr, "    JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]);
                if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))
                    fprintf(stderr, "    JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]);
            }
        }
    }
}
#endif

#if ENABLE(REGEXP_TRACING)
    void RegExp::printTraceData()
    {
        char formattedPattern[41];
        char rawPattern[41];

        strncpy(rawPattern, pattern().utf8().data(), 40);
        rawPattern[40]= '\0';

        int pattLen = strlen(rawPattern);

        snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern);

#if ENABLE(YARR_JIT)
        Yarr::YarrCodeBlock& codeBlock = m_representation->m_regExpJITCode;

        const size_t jitAddrSize = 20;
        char jitAddr[jitAddrSize];
        if (m_state == JITCode)
            snprintf(jitAddr, jitAddrSize, "fallback");
        else
            snprintf(jitAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.getAddr()));
#else
        const char* jitAddr = "JIT Off";
#endif

        printf("%-40.40s %16.16s %10d %10d\n", formattedPattern, jitAddr, m_rtMatchCallCount, m_rtMatchFoundCount);
    }
#endif
    
} // namespace JSC