RegularExpression.cpp   [plain text]


/*
 * Copyright (C) 2004 Apple Computer, Inc.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"
#include "RegularExpression.h"

#include "Logging.h"
#include "Shared.h"
#include <pcre/pcre.h>
#include <sys/types.h>

namespace WebCore {

const size_t maxSubstrings = 10;
const size_t maxOffsets = 3 * maxSubstrings;

class RegularExpression::Private : public Shared<RegularExpression::Private>
{
public:
    Private();
    Private(DeprecatedString pattern, bool caseSensitive, bool glob);
    ~Private();

    void compile(bool caseSensitive, bool glob);

    DeprecatedString pattern;
    pcre *regex;

    DeprecatedString lastMatchString;
    int lastMatchOffsets[maxOffsets];
    int lastMatchCount;
    int lastMatchPos;
    int lastMatchLength;
};

RegularExpression::Private::Private() : pattern("")
{
    compile(true, false);
}

RegularExpression::Private::Private(DeprecatedString p, bool caseSensitive, bool glob) : pattern(p), lastMatchPos(-1), lastMatchLength(-1)
{
    compile(caseSensitive, glob);
}

static DeprecatedString RegExpFromGlob(DeprecatedString glob)
{
    DeprecatedString result = glob;

    // escape regexp metacharacters which are NOT glob metacharacters

    result.replace(RegularExpression("\\\\"), "\\\\");
    result.replace(RegularExpression("\\."), "\\.");
    result.replace(RegularExpression("\\+"), "\\+");
    result.replace(RegularExpression("\\$"), "\\$");
    // FIXME: incorrect for ^ inside bracket group
    result.replace(RegularExpression("\\^"), "\\^");

    // translate glob metacharacters into regexp metacharacters
    result.replace(RegularExpression("\\*"), ".*");
    result.replace(RegularExpression("\\?"), ".");
   
    // Require the glob to match the whole string
    result = "^" + result + "$";

    return result;
}

void RegularExpression::Private::compile(bool caseSensitive, bool glob)
{
    DeprecatedString p;

    if (glob) {
        p = RegExpFromGlob(pattern);
    } else {
        p = pattern;
    }
    // Note we don't honor the Qt syntax for various character classes.  If we convert
    // to a different underlying engine, we may need to change client code that relies
    // on the regex syntax (see FrameMac.mm for a couple examples).
    
    const char *errorMessage;
    int errorOffset;
    regex = pcre_compile(reinterpret_cast<const uint16_t *>(p.unicode()), p.length(), caseSensitive ? 0 : PCRE_CASELESS, &errorMessage, &errorOffset, NULL);
    if (regex == NULL) {
        LOG_ERROR("RegularExpression: pcre_compile failed with '%s'", errorMessage);
    }
}

RegularExpression::Private::~Private()
{
    pcre_free(regex);
}


RegularExpression::RegularExpression() : d(new RegularExpression::Private())
{
}

RegularExpression::RegularExpression(const DeprecatedString &pattern, bool caseSensitive, bool glob) : d(new RegularExpression::Private(pattern, caseSensitive, glob))
{
}

RegularExpression::RegularExpression(const char *cpattern) : d(new RegularExpression::Private(cpattern, true, false))
{
}


RegularExpression::RegularExpression(const RegularExpression &re) : d (re.d)
{
}

RegularExpression::~RegularExpression()
{
}

RegularExpression &RegularExpression::operator=(const RegularExpression &re)
{
    RegularExpression tmp(re);
    RefPtr<RegularExpression::Private> tmpD = tmp.d;
    
    tmp.d = d;
    d = tmpD;

    return *this;
}

DeprecatedString RegularExpression::pattern() const
{
    return d->pattern;
}

int RegularExpression::match(const DeprecatedString &str, int startFrom, int *matchLength) const
{
    d->lastMatchString = str;
    // First 2 offsets are start and end offsets; 3rd entry is used internally by pcre
    d->lastMatchCount = pcre_exec(d->regex, NULL, reinterpret_cast<const uint16_t *>(d->lastMatchString.unicode()), d->lastMatchString.length(), startFrom, startFrom == 0 ? 0 : PCRE_NOTBOL, d->lastMatchOffsets, maxOffsets);
    if (d->lastMatchCount < 0) {
        if (d->lastMatchCount != PCRE_ERROR_NOMATCH)
            LOG_ERROR("RegularExpression: pcre_exec() failed with result %d", d->lastMatchCount);
        d->lastMatchPos = -1;
        d->lastMatchLength = -1;
        d->lastMatchString = DeprecatedString();
        return -1;
    }
    
    // 1 means 1 match; 0 means more than one match. First match is recorded in offsets.
    //ASSERT(d->lastMatchCount < 2);
    d->lastMatchPos = d->lastMatchOffsets[0];
    d->lastMatchLength = d->lastMatchOffsets[1] - d->lastMatchOffsets[0];
    if (matchLength != NULL) {
        *matchLength = d->lastMatchLength;
    }
    return d->lastMatchPos;
}

int RegularExpression::search(const DeprecatedString &str, int startFrom) const
{
    if (startFrom < 0) {
        startFrom = str.length() - startFrom;
    }
    return match(str, startFrom, NULL);
}

int RegularExpression::searchRev(const DeprecatedString &str) const
{
    // FIXME: Total hack for now.  Search forward, return the last, greedy match
    int start = 0;
    int pos;
    int lastPos = -1;
    int lastMatchLength = -1;
    do {
        int matchLength;
        pos = match(str, start, &matchLength);
        if (pos >= 0) {
            if ((pos+matchLength) > (lastPos+lastMatchLength)) {
                // replace last match if this one is later and not a subset of the last match
                lastPos = pos;
                lastMatchLength = matchLength;
            }
            start = pos + 1;
        }
    } while (pos != -1);
    d->lastMatchPos = lastPos;
    d->lastMatchLength = lastMatchLength;
    return lastPos;
}

int RegularExpression::pos(int n)
{
    ASSERT(n == 0);
    return d->lastMatchPos;
}

int RegularExpression::matchedLength() const
{
    return d->lastMatchLength;
}

DeprecatedString RegularExpression::cap(int n) const
{
    const pcre_char *substring = NULL;
    int substringLength = pcre_get_substring(reinterpret_cast<const uint16_t *>(d->lastMatchString.unicode()), d->lastMatchOffsets, d->lastMatchCount, n, &substring);
    if (substringLength > 0) {
       DeprecatedString capture(reinterpret_cast<const DeprecatedChar *>(substring), substringLength);
       pcre_free_substring(substring);
       return capture;
    }
    return DeprecatedString();
}

}