WebVTTParser.cpp [plain text]

/*
 * Copyright (C) 2011, 2013 Google Inc.  All rights reserved.
 * Copyright (C) 2013 Cable Television Labs, Inc.
 * Copyright (C) 2011-2020 Apple Inc.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(VIDEO_TRACK)

#include "WebVTTParser.h"

#include "HTMLParserIdioms.h"
#include "ISOVTTCue.h"
#include "ProcessingInstruction.h"
#include "StyleRule.h"
#include "StyleRuleImport.h"
#include "StyleSheetContents.h"
#include "Text.h"
#include "VTTScanner.h"
#include "WebVTTElement.h"
#include "WebVTTTokenizer.h"

namespace WebCore {

const double secondsPerHour = 3600;
const double secondsPerMinute = 60;
const double secondsPerMillisecond = 0.001;
const char* fileIdentifier = "WEBVTT";
const unsigned fileIdentifierLength = 6;
const unsigned regionIdentifierLength = 6;
const unsigned styleIdentifierLength = 5;

bool WebVTTParser::parseFloatPercentageValue(VTTScanner& valueScanner, float& percentage)
{
    float number;
    if (!valueScanner.scanFloat(number))
        return false;
    // '%' must be present and at the end of the setting value.
    if (!valueScanner.scan('%'))
        return false;

    if (number < 0 || number > 100)
        return false;

    percentage = number;
    return true;
}

bool WebVTTParser::parseFloatPercentageValuePair(VTTScanner& valueScanner, char delimiter, FloatPoint& valuePair)
{
    float firstCoord;
    if (!parseFloatPercentageValue(valueScanner, firstCoord))
        return false;

    if (!valueScanner.scan(delimiter))
        return false;

    float secondCoord;
    if (!parseFloatPercentageValue(valueScanner, secondCoord))
        return false;

    valuePair = FloatPoint(firstCoord, secondCoord);
    return true;
}

WebVTTParser::WebVTTParser(WebVTTParserClient* client, ScriptExecutionContext* context)
    : m_scriptExecutionContext(context)
    , m_state(Initial)
    , m_decoder(TextResourceDecoder::create("text/plain", UTF8Encoding()))
    , m_client(client)
{
}

void WebVTTParser::getNewCues(Vector<RefPtr<WebVTTCueData>>& outputCues)
{
    outputCues = WTFMove(m_cuelist);
}

void WebVTTParser::getNewRegions(Vector<RefPtr<VTTRegion>>& outputRegions)
{
    outputRegions = WTFMove(m_regionList);
}

Vector<String> WebVTTParser::getStyleSheets()
{
    return WTFMove(m_styleSheets);
}

void WebVTTParser::parseFileHeader(String&& data)
{
    m_state = Initial;
    m_lineReader.reset();
    m_lineReader.append(WTFMove(data));
    parse();
}

void WebVTTParser::parseBytes(const char* data, unsigned length)
{
    m_lineReader.append(m_decoder->decode(data, length));
    parse();
}

void WebVTTParser::parseCueData(const ISOWebVTTCue& data)
{
    auto cue = WebVTTCueData::create();

    MediaTime startTime = data.presentationTime();
    cue->setStartTime(startTime);
    cue->setEndTime(startTime + data.duration());

    cue->setContent(data.cueText());
    cue->setId(data.id());
    cue->setSettings(data.settings());

    MediaTime originalStartTime;
    if (WebVTTParser::collectTimeStamp(data.originalStartTime(), originalStartTime))
        cue->setOriginalStartTime(originalStartTime);

    m_cuelist.append(WTFMove(cue));
    if (m_client)
        m_client->newCuesParsed();
}

void WebVTTParser::flush()
{
    m_lineReader.append(m_decoder->flush());
    m_lineReader.appendEndOfStream();
    parse();
    flushPendingCue();
}

void WebVTTParser::parse()
{    
    // WebVTT parser algorithm. (5.1 WebVTT file parsing.)
    // Steps 1 - 3 - Initial setup.
    while (auto line = m_lineReader.nextLine()) {
        switch (m_state) {
        case Initial:
            // Steps 4 - 9 - Check for a valid WebVTT signature.
            if (!hasRequiredFileIdentifier(*line)) {
                if (m_client)
                    m_client->fileFailedToParse();
                return;
            }

            m_state = Header;
            break;

        case Header:
            // Steps 11 - 14 - Collect WebVTT block
            m_state = collectWebVTTBlock(*line);
            break;

        case Region:
            m_state = collectRegionSettings(*line);
            break;

        case Style:
            m_state = collectStyleSheet(*line);
            break;

        case Id:
            // Steps 17 - 20 - Allow any number of line terminators, then initialize new cue values.
            if (line->isEmpty())
                break;

            // Step 21 - Cue creation (start a new cue).
            resetCueValues();

            // Steps 22 - 25 - Check if this line contains an optional identifier or timing data.
            m_state = collectCueId(*line);
            break;

        case TimingsAndSettings:
            // Steps 26 - 27 - Discard current cue if the line is empty.
            if (line->isEmpty()) {
                m_state = Id;
                break;
            }

            // Steps 28 - 29 - Collect cue timings and settings.
            m_state = collectTimingsAndSettings(*line);
            break;

        case CueText:
            // Steps 31 - 41 - Collect the cue text, create a cue, and add it to the output.
            m_state = collectCueText(*line);
            break;

        case BadCue:
            // Steps 42 - 48 - Discard lines until an empty line or a potential timing line is seen.
            m_state = ignoreBadCue(*line);
            break;

        case Finished:
            ASSERT_NOT_REACHED();
            break;
        }
    }
}

void WebVTTParser::fileFinished()
{
    ASSERT(m_state != Finished);
    parseBytes("\n\n", 2);
    m_state = Finished;
}

void WebVTTParser::flushPendingCue()
{
    ASSERT(m_lineReader.isAtEndOfStream());
    // If we're in the CueText state when we run out of data, we emit the pending cue.
    if (m_state == CueText)
        createNewCue();
}

bool WebVTTParser::hasRequiredFileIdentifier(const String& line)
{
    // A WebVTT file identifier consists of an optional BOM character,
    // the string "WEBVTT" followed by an optional space or tab character,
    // and any number of characters that are not line terminators ...
    if (!line.startsWith(fileIdentifier))
        return false;
    if (line.length() > fileIdentifierLength && !isHTMLSpace(line[fileIdentifierLength]))
        return false;
    return true;
}

WebVTTParser::ParseState WebVTTParser::collectRegionSettings(const String& line)
{
    // End of region block
    if (checkAndStoreRegion(line))
        return checkAndRecoverCue(line);
    
    m_currentRegion->setRegionSettings(line);
    return Region;
}

WebVTTParser::ParseState WebVTTParser::collectWebVTTBlock(const String& line)
{
    // collect a WebVTT block parsing. (WebVTT parser algorithm step 14)
    
    if (checkAndCreateRegion(line))
        return Region;
    
    if (checkStyleSheet(line))
        return Style;

    // Handle cue block.
    ParseState state = checkAndRecoverCue(line);
    if (state != Header) {
        if (m_client) {
            if (!m_regionList.isEmpty())
                m_client->newRegionsParsed();
            if (!m_styleSheets.isEmpty())
                m_client->newStyleSheetsParsed();
        }
        if (!m_previousLine.isEmpty() && !m_previousLine.contains("-->"))
            m_currentId = m_previousLine;
        
        return state;
    }
    
    // store previous line for cue id.
    // length is more than 1 line clear m_previousLine and ignore line.
    if (m_previousLine.isEmpty())
        m_previousLine = line;
    else
        m_previousLine = emptyString();

    return state;
}

WebVTTParser::ParseState WebVTTParser::checkAndRecoverCue(const String& line)
{
    // parse cue timings and settings
    if (line.contains("-->")) {
        ParseState state = recoverCue(line);
        if (state != BadCue)
            return state;
    }
    return Header;
}

WebVTTParser::ParseState WebVTTParser::collectStyleSheet(const String& line)
{
    // End of style block
    if (checkAndStoreStyleSheet(line))
        return checkAndRecoverCue(line);

    m_currentSourceStyleSheet.append(line);
    return Style;
}

bool WebVTTParser::checkAndCreateRegion(const String& line)
{
    if (m_previousLine.contains("-->"))
        return false;
    // line starts with the substring "REGION" and remaining characters
    // zero or more U+0020 SPACE characters or U+0009 CHARACTER TABULATION
    // (tab) characters expected other than these charecters it is invalid.
    if (line.startsWith("REGION") && line.substring(regionIdentifierLength).isAllSpecialCharacters<isASpace>()) {
        m_currentRegion = VTTRegion::create(*m_scriptExecutionContext);
        return true;
    }
    return false;
}

bool WebVTTParser::checkAndStoreRegion(const String& line)
{
    if (!line.isEmpty() && !line.contains("-->"))
        return false;
    
    if (!m_currentRegion->id().isEmpty()) {
        // If the text track list of regions regions contains a region
        // with the same region identifier value as region, remove that region.
        for (const auto& region : m_regionList) {
            if (region->id() == m_currentRegion->id()) {
                m_regionList.removeFirst(region);
                break;
            }
        }
        m_regionList.append(m_currentRegion);
    }
    m_currentRegion = nullptr;
    return true;
}

bool WebVTTParser::checkStyleSheet(const String& line)
{
    if (m_previousLine.contains("-->"))
        return false;
    // line starts with the substring "STYLE" and remaining characters
    // zero or more U+0020 SPACE characters or U+0009 CHARACTER TABULATION
    // (tab) characters expected other than these charecters it is invalid.
    if (line.startsWith("STYLE") && line.substring(styleIdentifierLength).isAllSpecialCharacters<isASpace>())
        return true;

    return false;
}

bool WebVTTParser::checkAndStoreStyleSheet(const String& line)
{
    if (!line.isEmpty() && !line.contains("-->"))
        return false;
    
    auto styleSheetText = WTFMove(m_currentSourceStyleSheet);

    // WebVTTMode disallows non-data URLs.
    auto contents = StyleSheetContents::create(CSSParserContext(WebVTTMode));
    if (!contents->parseString(styleSheetText))
        return true;

    auto& namespaceRules = contents->namespaceRules();
    if (namespaceRules.size())
        return true;

    auto& importRules = contents->importRules();
    if (importRules.size())
        return true;

    auto& childRules = contents->childRules();
    if (!childRules.size())
        return true;

    StringBuilder sanitizedStyleSheetBuilder;
    
    for (const auto& rule : childRules) {
        if (!rule->isStyleRule())
            return true;
        const auto& styleRule = downcast<StyleRule>(*rule);

        const auto& selectorList = styleRule.selectorList();
        if (selectorList.listSize() != 1)
            return true;
        auto selector = selectorList.selectorAt(0);
        auto selectorText = selector->selectorText();
        
        bool isCue = selectorText == "::cue" || selectorText.startsWith("::cue(");
        if (!isCue)
            return true;

        if (styleRule.properties().isEmpty())
            continue;

        sanitizedStyleSheetBuilder.append(selectorText, " { ", styleRule.properties().asText(), "  }\n");
    }

    // It would be more stylish to parse the stylesheet only once instead of serializing a sanitized version.
    if (!sanitizedStyleSheetBuilder.isEmpty())
        m_styleSheets.append(sanitizedStyleSheetBuilder.toString());

    return true;
}

WebVTTParser::ParseState WebVTTParser::collectCueId(const String& line)
{
    if (line.contains("-->"))
        return collectTimingsAndSettings(line);
    m_currentId = line;
    return TimingsAndSettings;
}

WebVTTParser::ParseState WebVTTParser::collectTimingsAndSettings(const String& line)
{
    if (line.isEmpty())
        return BadCue;

    VTTScanner input(line);

    // Collect WebVTT cue timings and settings. (5.3 WebVTT cue timings and settings parsing.)
    // Steps 1 - 3 - Let input be the string being parsed and position be a pointer into input
    input.skipWhile<isHTMLSpace<UChar>>();

    // Steps 4 - 5 - Collect a WebVTT timestamp. If that fails, then abort and return failure. Otherwise, let cue's text track cue start time be the collected time.
    if (!collectTimeStamp(input, m_currentStartTime))
        return BadCue;
    
    input.skipWhile<isHTMLSpace<UChar>>();

    // Steps 6 - 9 - If the next three characters are not "-->", abort and return failure.
    if (!input.scan("-->"))
        return BadCue;
    
    input.skipWhile<isHTMLSpace<UChar>>();

    // Steps 10 - 11 - Collect a WebVTT timestamp. If that fails, then abort and return failure. Otherwise, let cue's text track cue end time be the collected time.
    if (!collectTimeStamp(input, m_currentEndTime))
        return BadCue;

    input.skipWhile<isHTMLSpace<UChar>>();

    // Step 12 - Parse the WebVTT settings for the cue (conducted in TextTrackCue).
    m_currentSettings = input.restOfInputAsString();
    return CueText;
}

WebVTTParser::ParseState WebVTTParser::collectCueText(const String& line)
{
    // Step 34.
    if (line.isEmpty()) {
        createNewCue();
        return Id;
    }
    // Step 35.
    if (line.contains("-->")) {
        // Step 39-40.
        createNewCue();

        // Step 41 - New iteration of the cue loop.
        return recoverCue(line);
    }
    if (!m_currentContent.isEmpty())
        m_currentContent.append('\n');
    m_currentContent.append(line);

    return CueText;
}

WebVTTParser::ParseState WebVTTParser::recoverCue(const String& line)
{
    // Step 17 and 21.
    resetCueValues();

    // Step 22.
    return collectTimingsAndSettings(line);
}

WebVTTParser::ParseState WebVTTParser::ignoreBadCue(const String& line)
{
    if (line.isEmpty())
        return Id;
    if (line.contains("-->"))
        return recoverCue(line);
    return BadCue;
}

// A helper class for the construction of a "cue fragment" from the cue text.
class WebVTTTreeBuilder {
public:
    WebVTTTreeBuilder(Document& document)
        : m_document(document) { }

    Ref<DocumentFragment> buildFromString(const String& cueText);

private:
    void constructTreeFromToken(Document&);

    WebVTTToken m_token;
    RefPtr<ContainerNode> m_currentNode;
    Vector<AtomString> m_languageStack;
    Document& m_document;
};

Ref<DocumentFragment> WebVTTTreeBuilder::buildFromString(const String& cueText)
{
    // Cue text processing based on
    // 5.4 WebVTT cue text parsing rules, and
    // 5.5 WebVTT cue text DOM construction rules.
    auto fragment = DocumentFragment::create(m_document);

    if (cueText.isEmpty()) {
        fragment->parserAppendChild(Text::create(m_document, emptyString()));
        return fragment;
    }

    m_currentNode = fragment.ptr();

    WebVTTTokenizer tokenizer(cueText);
    m_languageStack.clear();

    while (tokenizer.nextToken(m_token))
        constructTreeFromToken(m_document);
    
    return fragment;
}

Ref<DocumentFragment> WebVTTParser::createDocumentFragmentFromCueText(Document& document, const String& cueText)
{
    WebVTTTreeBuilder treeBuilder(document);
    return treeBuilder.buildFromString(cueText);
}

void WebVTTParser::createNewCue()
{
    auto cue = WebVTTCueData::create();
    cue->setStartTime(m_currentStartTime);
    cue->setEndTime(m_currentEndTime);
    cue->setContent(m_currentContent.toString());
    cue->setId(m_currentId);
    cue->setSettings(m_currentSettings);

    m_cuelist.append(WTFMove(cue));
    if (m_client)
        m_client->newCuesParsed();
}

void WebVTTParser::resetCueValues()
{
    m_currentId = emptyString();
    m_currentSettings = emptyString();
    m_currentStartTime = MediaTime::zeroTime();
    m_currentEndTime = MediaTime::zeroTime();
    m_currentContent.clear();
}

bool WebVTTParser::collectTimeStamp(const String& line, MediaTime& timeStamp)
{
    if (line.isEmpty())
        return false;

    VTTScanner input(line);
    return collectTimeStamp(input, timeStamp);
}

bool WebVTTParser::collectTimeStamp(VTTScanner& input, MediaTime& timeStamp)
{
    // Collect a WebVTT timestamp (5.3 WebVTT cue timings and settings parsing.)
    // Steps 1 - 4 - Initial checks, let most significant units be minutes.
    enum Mode { minutes, hours };
    Mode mode = minutes;

    // Steps 5 - 7 - Collect a sequence of characters that are 0-9.
    // If not 2 characters or value is greater than 59, interpret as hours.
    int value1;
    unsigned value1Digits = input.scanDigits(value1);
    if (!value1Digits)
        return false;
    if (value1Digits != 2 || value1 > 59)
        mode = hours;

    // Steps 8 - 11 - Collect the next sequence of 0-9 after ':' (must be 2 chars).
    int value2;
    if (!input.scan(':') || input.scanDigits(value2) != 2)
        return false;

    // Step 12 - Detect whether this timestamp includes hours.
    int value3;
    if (mode == hours || input.match(':')) {
        if (!input.scan(':') || input.scanDigits(value3) != 2)
            return false;
    } else {
        value3 = value2;
        value2 = value1;
        value1 = 0;
    }

    // Steps 13 - 17 - Collect next sequence of 0-9 after '.' (must be 3 chars).
    int value4;
    if (!input.scan('.') || input.scanDigits(value4) != 3)
        return false;
    if (value2 > 59 || value3 > 59)
        return false;

    // Steps 18 - 19 - Calculate result.
    timeStamp = MediaTime::createWithDouble((value1 * secondsPerHour) + (value2 * secondsPerMinute) + value3 + (value4 * secondsPerMillisecond));
    return true;
}

static WebVTTNodeType tokenToNodeType(WebVTTToken& token)
{
    switch (token.name().length()) {
    case 1:
        if (token.name()[0] == 'c')
            return WebVTTNodeTypeClass;
        if (token.name()[0] == 'v')
            return WebVTTNodeTypeVoice;
        if (token.name()[0] == 'b')
            return WebVTTNodeTypeBold;
        if (token.name()[0] == 'i')
            return WebVTTNodeTypeItalic;
        if (token.name()[0] == 'u')
            return WebVTTNodeTypeUnderline;
        break;
    case 2:
        if (token.name()[0] == 'r' && token.name()[1] == 't')
            return WebVTTNodeTypeRubyText;
        break;
    case 4:
        if (token.name()[0] == 'r' && token.name()[1] == 'u' && token.name()[2] == 'b' && token.name()[3] == 'y')
            return WebVTTNodeTypeRuby;
        if (token.name()[0] == 'l' && token.name()[1] == 'a' && token.name()[2] == 'n' && token.name()[3] == 'g')
            return WebVTTNodeTypeLanguage;
        break;
    }
    return WebVTTNodeTypeNone;
}

void WebVTTTreeBuilder::constructTreeFromToken(Document& document)
{
    // http://dev.w3.org/html5/webvtt/#webvtt-cue-text-dom-construction-rules

    switch (m_token.type()) {
    case WebVTTTokenTypes::Character: {
        m_currentNode->parserAppendChild(Text::create(document, m_token.characters()));
        break;
    }
    case WebVTTTokenTypes::StartTag: {
        WebVTTNodeType nodeType = tokenToNodeType(m_token);
        if (nodeType == WebVTTNodeTypeNone)
            break;

        WebVTTNodeType currentType = is<WebVTTElement>(*m_currentNode) ? downcast<WebVTTElement>(*m_currentNode).webVTTNodeType() : WebVTTNodeTypeNone;
        // <rt> is only allowed if the current node is <ruby>.
        if (nodeType == WebVTTNodeTypeRubyText && currentType != WebVTTNodeTypeRuby)
            break;

        auto child = WebVTTElement::create(nodeType, document);
        if (!m_token.classes().isEmpty())
            child->setAttributeWithoutSynchronization(classAttr, m_token.classes());

        if (nodeType == WebVTTNodeTypeVoice)
            child->setAttributeWithoutSynchronization(WebVTTElement::voiceAttributeName(), m_token.annotation());
        else if (nodeType == WebVTTNodeTypeLanguage) {
            m_languageStack.append(m_token.annotation());
            child->setAttributeWithoutSynchronization(WebVTTElement::langAttributeName(), m_languageStack.last());
        }
        if (!m_languageStack.isEmpty())
            child->setLanguage(m_languageStack.last());
        m_currentNode->parserAppendChild(child);
        m_currentNode = WTFMove(child);
        break;
    }
    case WebVTTTokenTypes::EndTag: {
        WebVTTNodeType nodeType = tokenToNodeType(m_token);
        if (nodeType == WebVTTNodeTypeNone)
            break;
        
        // The only non-VTTElement would be the DocumentFragment root. (Text
        // nodes and PIs will never appear as m_currentNode.)
        if (!is<WebVTTElement>(*m_currentNode))
            break;

        WebVTTNodeType currentType = downcast<WebVTTElement>(*m_currentNode).webVTTNodeType();
        bool matchesCurrent = nodeType == currentType;
        if (!matchesCurrent) {
            // </ruby> auto-closes <rt>
            if (currentType == WebVTTNodeTypeRubyText && nodeType == WebVTTNodeTypeRuby) {
                if (m_currentNode->parentNode())
                    m_currentNode = m_currentNode->parentNode();
            } else
                break;
        }
        if (nodeType == WebVTTNodeTypeLanguage)
            m_languageStack.removeLast();
        if (m_currentNode->parentNode())
            m_currentNode = m_currentNode->parentNode();
        break;
    }
    case WebVTTTokenTypes::TimestampTag: {
        String charactersString = m_token.characters();
        MediaTime parsedTimeStamp;
        if (WebVTTParser::collectTimeStamp(charactersString, parsedTimeStamp))
            m_currentNode->parserAppendChild(ProcessingInstruction::create(document, "timestamp", charactersString));
        break;
    }
    default:
        break;
    }
}

}

#endif