URL.cpp [plain text]

/*
 * Copyright (C) 2004, 2007-2008, 2011-2013, 2015-2016 Apple Inc. All rights reserved.
 * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"
#include "URL.h"

#include "DecodeEscapeSequences.h"
#include "MIMETypeRegistry.h"
#include "TextEncoding.h"
#include "URLParser.h"
#include <stdio.h>
#include <unicode/uidna.h>
#include <wtf/HashMap.h>
#include <wtf/HexNumber.h>
#include <wtf/NeverDestroyed.h>
#include <wtf/StdLibExtras.h>
#include <wtf/UUID.h>
#include <wtf/text/CString.h>
#include <wtf/text/StringBuilder.h>
#include <wtf/text/StringHash.h>

// FIXME: This file makes too much use of the + operator on String.
// We either have to optimize that operator so it doesn't involve
// so many allocations, or change this to use StringBuffer instead.

using namespace WTF;

namespace WebCore {

typedef Vector<char, 512> CharBuffer;
typedef Vector<UChar, 512> UCharBuffer;

static const unsigned invalidPortNumber = 0xFFFF;

enum URLCharacterClasses {
    // alpha 
    SchemeFirstChar = 1 << 0,

    // ( alpha | digit | "+" | "-" | "." )
    SchemeChar = 1 << 1,

    // mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    // unreserved  = alphanum | mark
    // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
    UserInfoChar = 1 << 2,

    // alnum | "." | "-" | "%"
    // The above is what the specification says, but we are lenient to
    // match existing practice and also allow:
    // "_"
    HostnameChar = 1 << 3,

    // hexdigit | ":" | "%"
    IPv6Char = 1 << 4,

    // "#" | "?" | "/" | nul
    PathSegmentEndChar = 1 << 5,

    // not allowed in path
    BadChar = 1 << 6,

    // "\t" | "\n" | "\r"
    TabNewline = 1 << 7
};

static const unsigned char characterClassTable[256] = {
    /* 0 nul */ PathSegmentEndChar,    /* 1 soh */ BadChar,
    /* 2 stx */ BadChar,    /* 3 etx */ BadChar,
    /* 4 eot */ BadChar,    /* 5 enq */ BadChar,    /* 6 ack */ BadChar,    /* 7 bel */ BadChar,
    /* 8 bs */ BadChar,     /* 9 ht */ BadChar | TabNewline,                /* 10 nl */ BadChar | TabNewline,
    /* 11 vt */ BadChar,    /* 12 np */ BadChar,    /* 13 cr */ BadChar | TabNewline,
    /* 14 so */ BadChar,    /* 15 si */ BadChar,
    /* 16 dle */ BadChar,   /* 17 dc1 */ BadChar,   /* 18 dc2 */ BadChar,   /* 19 dc3 */ BadChar,
    /* 20 dc4 */ BadChar,   /* 21 nak */ BadChar,   /* 22 syn */ BadChar,   /* 23 etb */ BadChar,
    /* 24 can */ BadChar,   /* 25 em */ BadChar,    /* 26 sub */ BadChar,   /* 27 esc */ BadChar,
    /* 28 fs */ BadChar,    /* 29 gs */ BadChar,    /* 30 rs */ BadChar,    /* 31 us */ BadChar,
    /* 32 sp */ BadChar,    /* 33  ! */ UserInfoChar,
    /* 34  " */ BadChar,    /* 35  # */ PathSegmentEndChar | BadChar,
    /* 36  $ */ UserInfoChar,    /* 37  % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
    /* 38  & */ UserInfoChar,    /* 39  ' */ UserInfoChar,
    /* 40  ( */ UserInfoChar,    /* 41  ) */ UserInfoChar,
    /* 42  * */ UserInfoChar,    /* 43  + */ SchemeChar | UserInfoChar,
    /* 44  , */ UserInfoChar,
    /* 45  - */ SchemeChar | UserInfoChar | HostnameChar,
    /* 46  . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 47  / */ PathSegmentEndChar,
    /* 48  0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, 
    /* 49  1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,    
    /* 50  2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, 
    /* 51  3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 52  4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, 
    /* 53  5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 54  6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, 
    /* 55  7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 56  8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, 
    /* 57  9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 58  : */ UserInfoChar | IPv6Char,    /* 59  ; */ UserInfoChar,
    /* 60  < */ BadChar,    /* 61  = */ UserInfoChar,
    /* 62  > */ BadChar,    /* 63  ? */ PathSegmentEndChar | BadChar,
    /* 64  @ */ 0,
    /* 65  A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,    
    /* 66  B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 67  C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 68  D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 69  E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 70  F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 71  G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 72  H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 73  I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 74  J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 75  K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 76  L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 77  M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 78  N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 79  O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 80  P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 81  Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 82  R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 83  S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 84  T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 85  U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 86  V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 87  W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 88  X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 89  Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 90  Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 91  [ */ 0,
    /* 92  \ */ 0,    /* 93  ] */ 0,
    /* 94  ^ */ 0,
    /* 95  _ */ UserInfoChar | HostnameChar,
    /* 96  ` */ 0,
    /* 97  a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 98  b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, 
    /* 99  c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 100  d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, 
    /* 101  e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 102  f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, 
    /* 103  g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 104  h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 105  i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 106  j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 107  k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 108  l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 109  m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 110  n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 111  o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 112  p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 113  q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 114  r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 115  s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 116  t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 117  u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 118  v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 119  w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 120  x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 121  y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 122  z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, 
    /* 123  { */ 0,
    /* 124  | */ 0,   /* 125  } */ 0,   /* 126  ~ */ UserInfoChar,   /* 127 del */ BadChar,
    /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
    /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
    /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
    /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
    /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
    /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
    /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
    /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
    /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
    /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
    /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
    /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
    /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
    /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
    /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
    /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
    /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
    /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
    /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
    /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
    /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
    /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
    /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
    /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
    /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
    /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
    /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
    /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
    /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
    /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
    /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
    /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
};

enum PercentEncodeCharacterClass {
    // Class names match the URL Standard; each class is a superset of the previous one.
    PercentEncodeSimple = 255,
    PercentEncodeDefault = 127,
    PercentEncodePassword = 63,
    PercentEncodeUsername = 31,
};

static const unsigned char percentEncodeClassTable[256] = {
    /* 0 nul */ PercentEncodeSimple,    /* 1 soh */ PercentEncodeSimple,    /* 2 stx */ PercentEncodeSimple,    /* 3 etx */ PercentEncodeSimple,
    /* 4 eot */ PercentEncodeSimple,    /* 5 enq */ PercentEncodeSimple,    /* 6 ack */ PercentEncodeSimple,    /* 7 bel */ PercentEncodeSimple,
    /* 8 bs */ PercentEncodeSimple,     /* 9 ht */ PercentEncodeSimple,     /* 10 nl */ PercentEncodeSimple,    /* 11 vt */ PercentEncodeSimple,
    /* 12 np */ PercentEncodeSimple,    /* 13 cr */ PercentEncodeSimple,    /* 14 so */ PercentEncodeSimple,    /* 15 si */ PercentEncodeSimple,
    /* 16 dle */ PercentEncodeSimple,   /* 17 dc1 */ PercentEncodeSimple,   /* 18 dc2 */ PercentEncodeSimple,   /* 19 dc3 */ PercentEncodeSimple,
    /* 20 dc4 */ PercentEncodeSimple,   /* 21 nak */ PercentEncodeSimple,   /* 22 syn */ PercentEncodeSimple,   /* 23 etb */ PercentEncodeSimple,
    /* 24 can */ PercentEncodeSimple,   /* 25 em */ PercentEncodeSimple,    /* 26 sub */ PercentEncodeSimple,   /* 27 esc */ PercentEncodeSimple,
    /* 28 fs */ PercentEncodeSimple,    /* 29 gs */ PercentEncodeSimple,    /* 30 rs */ PercentEncodeSimple,    /* 31 us */ PercentEncodeSimple,
    /* 32 sp */ PercentEncodeDefault,
    /* 33  ! */ 0,
    /* 34  " */ PercentEncodeDefault,
    /* 35  # */ PercentEncodeDefault,
    /* 36  $ */ 0,
    /* 37  % */ 0,
    /* 38  & */ 0,
    /* 39  ' */ 0,
    /* 40  ( */ 0,
    /* 41  ) */ 0,
    /* 42  * */ 0,
    /* 43  + */ 0,
    /* 44  , */ 0,
    /* 45  - */ 0,
    /* 46  . */ 0,
    /* 47  / */ PercentEncodePassword,
    /* 48  0 */ 0,    /* 49  1 */ 0,    /* 50  2 */ 0,    /* 51  3 */ 0,
    /* 52  4 */ 0,    /* 53  5 */ 0,    /* 54  6 */ 0,    /* 55  7 */ 0,
    /* 56  8 */ 0,    /* 57  9 */ 0,
    /* 58  : */ PercentEncodeUsername,
    /* 59  ; */ 0,
    /* 60  < */ PercentEncodeDefault,
    /* 61  = */ 0,
    /* 62  > */ PercentEncodeDefault,
    /* 63  ? */ PercentEncodeDefault,
    /* 64  @ */ PercentEncodePassword,
    /* 65  A */ 0,    /* 66  B */ 0,    /* 67  C */ 0,    /* 68  D */ 0,
    /* 69  E */ 0,    /* 70  F */ 0,    /* 71  G */ 0,    /* 72  H */ 0,
    /* 73  I */ 0,    /* 74  J */ 0,    /* 75  K */ 0,    /* 76  L */ 0,
    /* 77  M */ 0,    /* 78  N */ 0,    /* 79  O */ 0,    /* 80  P */ 0,
    /* 81  Q */ 0,    /* 82  R */ 0,    /* 83  S */ 0,    /* 84  T */ 0,
    /* 85  U */ 0,    /* 86  V */ 0,    /* 87  W */ 0,    /* 88  X */ 0,
    /* 89  Y */ 0,    /* 90  Z */ 0,
    /* 91  [ */ 0,
    /* 92  \ */ PercentEncodePassword,
    /* 93  ] */ 0,
    /* 94  ^ */ 0,
    /* 95  _ */ 0,
    /* 96  ` */ PercentEncodeDefault,
    /* 97  a */ 0,    /* 98  b */ 0,    /* 99  c */ 0,    /* 100  d */ 0,
    /* 101  e */ 0,    /* 102  f */ 0,    /* 103  g */ 0,    /* 104  h */ 0,
    /* 105  i */ 0,    /* 106  j */ 0,    /* 107  k */ 0,    /* 108  l */ 0,
    /* 109  m */ 0,    /* 110  n */ 0,    /* 111  o */ 0,    /* 112  p */ 0,
    /* 113  q */ 0,    /* 114  r */ 0,    /* 115  s */ 0,    /* 116  t */ 0,
    /* 117  u */ 0,    /* 118  v */ 0,    /* 119  w */ 0,    /* 120  x */ 0,
    /* 121  y */ 0,    /* 122  z */ 0,
    /* 123  { */ 0,
    /* 124  | */ 0,
    /* 125  } */ 0,
    /* 126  ~ */ 0,
    /* 127 del */ PercentEncodeSimple,
    /* 128 */ PercentEncodeSimple, /* 129 */ PercentEncodeSimple, /* 130 */ PercentEncodeSimple, /* 131 */ PercentEncodeSimple,
    /* 132 */ PercentEncodeSimple, /* 133 */ PercentEncodeSimple, /* 134 */ PercentEncodeSimple, /* 135 */ PercentEncodeSimple,
    /* 136 */ PercentEncodeSimple, /* 137 */ PercentEncodeSimple, /* 138 */ PercentEncodeSimple, /* 139 */ PercentEncodeSimple,
    /* 140 */ PercentEncodeSimple, /* 141 */ PercentEncodeSimple, /* 142 */ PercentEncodeSimple, /* 143 */ PercentEncodeSimple,
    /* 144 */ PercentEncodeSimple, /* 145 */ PercentEncodeSimple, /* 146 */ PercentEncodeSimple, /* 147 */ PercentEncodeSimple,
    /* 148 */ PercentEncodeSimple, /* 149 */ PercentEncodeSimple, /* 150 */ PercentEncodeSimple, /* 151 */ PercentEncodeSimple,
    /* 152 */ PercentEncodeSimple, /* 153 */ PercentEncodeSimple, /* 154 */ PercentEncodeSimple, /* 155 */ PercentEncodeSimple,
    /* 156 */ PercentEncodeSimple, /* 157 */ PercentEncodeSimple, /* 158 */ PercentEncodeSimple, /* 159 */ PercentEncodeSimple,
    /* 160 */ PercentEncodeSimple, /* 161 */ PercentEncodeSimple, /* 162 */ PercentEncodeSimple, /* 163 */ PercentEncodeSimple,
    /* 164 */ PercentEncodeSimple, /* 165 */ PercentEncodeSimple, /* 166 */ PercentEncodeSimple, /* 167 */ PercentEncodeSimple,
    /* 168 */ PercentEncodeSimple, /* 169 */ PercentEncodeSimple, /* 170 */ PercentEncodeSimple, /* 171 */ PercentEncodeSimple,
    /* 172 */ PercentEncodeSimple, /* 173 */ PercentEncodeSimple, /* 174 */ PercentEncodeSimple, /* 175 */ PercentEncodeSimple,
    /* 176 */ PercentEncodeSimple, /* 177 */ PercentEncodeSimple, /* 178 */ PercentEncodeSimple, /* 179 */ PercentEncodeSimple,
    /* 180 */ PercentEncodeSimple, /* 181 */ PercentEncodeSimple, /* 182 */ PercentEncodeSimple, /* 183 */ PercentEncodeSimple,
    /* 184 */ PercentEncodeSimple, /* 185 */ PercentEncodeSimple, /* 186 */ PercentEncodeSimple, /* 187 */ PercentEncodeSimple,
    /* 188 */ PercentEncodeSimple, /* 189 */ PercentEncodeSimple, /* 190 */ PercentEncodeSimple, /* 191 */ PercentEncodeSimple,
    /* 192 */ PercentEncodeSimple, /* 193 */ PercentEncodeSimple, /* 194 */ PercentEncodeSimple, /* 195 */ PercentEncodeSimple,
    /* 196 */ PercentEncodeSimple, /* 197 */ PercentEncodeSimple, /* 198 */ PercentEncodeSimple, /* 199 */ PercentEncodeSimple,
    /* 200 */ PercentEncodeSimple, /* 201 */ PercentEncodeSimple, /* 202 */ PercentEncodeSimple, /* 203 */ PercentEncodeSimple,
    /* 204 */ PercentEncodeSimple, /* 205 */ PercentEncodeSimple, /* 206 */ PercentEncodeSimple, /* 207 */ PercentEncodeSimple,
    /* 208 */ PercentEncodeSimple, /* 209 */ PercentEncodeSimple, /* 210 */ PercentEncodeSimple, /* 211 */ PercentEncodeSimple,
    /* 212 */ PercentEncodeSimple, /* 213 */ PercentEncodeSimple, /* 214 */ PercentEncodeSimple, /* 215 */ PercentEncodeSimple,
    /* 216 */ PercentEncodeSimple, /* 217 */ PercentEncodeSimple, /* 218 */ PercentEncodeSimple, /* 219 */ PercentEncodeSimple,
    /* 220 */ PercentEncodeSimple, /* 221 */ PercentEncodeSimple, /* 222 */ PercentEncodeSimple, /* 223 */ PercentEncodeSimple,
    /* 224 */ PercentEncodeSimple, /* 225 */ PercentEncodeSimple, /* 226 */ PercentEncodeSimple, /* 227 */ PercentEncodeSimple,
    /* 228 */ PercentEncodeSimple, /* 229 */ PercentEncodeSimple, /* 230 */ PercentEncodeSimple, /* 231 */ PercentEncodeSimple,
    /* 232 */ PercentEncodeSimple, /* 233 */ PercentEncodeSimple, /* 234 */ PercentEncodeSimple, /* 235 */ PercentEncodeSimple,
    /* 236 */ PercentEncodeSimple, /* 237 */ PercentEncodeSimple, /* 238 */ PercentEncodeSimple, /* 239 */ PercentEncodeSimple,
    /* 240 */ PercentEncodeSimple, /* 241 */ PercentEncodeSimple, /* 242 */ PercentEncodeSimple, /* 243 */ PercentEncodeSimple,
    /* 244 */ PercentEncodeSimple, /* 245 */ PercentEncodeSimple, /* 246 */ PercentEncodeSimple, /* 247 */ PercentEncodeSimple,
    /* 248 */ PercentEncodeSimple, /* 249 */ PercentEncodeSimple, /* 250 */ PercentEncodeSimple, /* 251 */ PercentEncodeSimple,
    /* 252 */ PercentEncodeSimple, /* 253 */ PercentEncodeSimple, /* 254 */ PercentEncodeSimple, /* 255 */ PercentEncodeSimple
};

static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }
static inline bool isTabNewline(UChar c) { return c <= 0xff && (characterClassTable[c] & TabNewline); }

String encodeWithURLEscapeSequences(const String& notEncodedString, PercentEncodeCharacterClass whatToEncode);

// Copies the source to the destination, assuming all the source characters are
// ASCII. The destination buffer must be large enough. Null characters are allowed
// in the source string, and no attempt is made to null-terminate the result.
static void copyASCII(const String& string, char* dest)
{
    if (string.isEmpty())
        return;

    if (string.is8Bit())
        memcpy(dest, string.characters8(), string.length());
    else {
        const UChar* src = string.characters16();
        size_t length = string.length();
        for (size_t i = 0; i < length; i++)
            dest[i] = static_cast<char>(src[i]);
    }
}

void URL::invalidate()
{
    m_isValid = false;
    m_protocolIsInHTTPFamily = false;
    m_cannotBeABaseURL = false;
    m_schemeEnd = 0;
    m_userStart = 0;
    m_userEnd = 0;
    m_passwordEnd = 0;
    m_hostEnd = 0;
    m_portEnd = 0;
    m_pathEnd = 0;
    m_pathAfterLastSlash = 0;
    m_queryEnd = 0;
}

URL::URL(ParsedURLStringTag, const String& url)
{
    URLParser parser(url);
    *this = parser.result();

#if OS(WINDOWS)
    // FIXME(148598): Work around Windows local file handling bug in CFNetwork
    ASSERT(isLocalFile() || url == m_string);
#else
    ASSERT(url == m_string);
#endif
}

URL::URL(const URL& base, const String& relative)
{
    URLParser parser(relative, base);
    *this = parser.result();
}

URL::URL(const URL& base, const String& relative, const TextEncoding& encoding)
{
    // For UTF-{7,16,32}, we want to use UTF-8 for the query part as
    // we do when submitting a form. A form with GET method
    // has its contents added to a URL as query params and it makes sense
    // to be consistent.
    URLParser parser(relative, base, encoding.encodingForFormSubmission());
    *this = parser.result();
}

static bool shouldTrimFromURL(UChar c)
{
    // Browsers ignore leading/trailing whitespace and control
    // characters from URLs.  Note that c is an *unsigned* char here
    // so this comparison should only catch control characters.
    return c <= ' ';
}

URL URL::isolatedCopy() const
{
    URL result = *this;
    result.m_string = result.m_string.isolatedCopy();
    return result;
}

String URL::lastPathComponent() const
{
    if (!hasPath())
        return String();

    unsigned end = m_pathEnd - 1;
    if (m_string[end] == '/')
        --end;

    size_t start = m_string.reverseFind('/', end);
    if (start < static_cast<unsigned>(m_portEnd))
        return String();
    ++start;

    return m_string.substring(start, end - start + 1);
}

StringView URL::protocol() const
{
    return StringView(m_string).substring(0, m_schemeEnd);
}

String URL::host() const
{
    unsigned start = hostStart();
    return m_string.substring(start, m_hostEnd - start);
}

std::optional<uint16_t> URL::port() const
{
    if (!m_portEnd || m_hostEnd >= m_portEnd - 1)
        return std::nullopt;

    bool ok = false;
    unsigned number;
    if (m_string.is8Bit())
        number = charactersToUIntStrict(m_string.characters8() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
    else
        number = charactersToUIntStrict(m_string.characters16() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
    if (!ok || number > std::numeric_limits<uint16_t>::max())
        return std::nullopt;
    return number;
}

String URL::hostAndPort() const
{
    if (auto port = this->port())
        return host() + ':' + String::number(port.value());
    return host();
}

String URL::protocolHostAndPort() const
{
    String result = m_string.substring(0, m_portEnd);

    if (m_passwordEnd - m_userStart > 0) {
        const int allowForTrailingAtSign = 1;
        result.remove(m_userStart, m_passwordEnd - m_userStart + allowForTrailingAtSign);
    }

    return result;
}

String URL::user() const
{
    return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart));
}

String URL::pass() const
{
    if (m_passwordEnd == m_userEnd)
        return String();

    return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1));
}

String URL::encodedUser() const
{
    return m_string.substring(m_userStart, m_userEnd - m_userStart);
}

String URL::encodedPass() const
{
    if (m_passwordEnd == m_userEnd)
        return String();

    return m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1);
}

String URL::fragmentIdentifier() const
{
    if (!m_isValid || m_queryEnd == m_string.length())
        return String();

    return m_string.substring(m_queryEnd + 1);
}

bool URL::hasFragmentIdentifier() const
{
    return m_isValid && m_string.length() != m_queryEnd;
}

String URL::baseAsString() const
{
    return m_string.left(m_pathAfterLastSlash);
}

#if !USE(CF)

String URL::fileSystemPath() const
{
    if (!isValid() || !isLocalFile())
        return String();

    return decodeURLEscapeSequences(path());
}

#endif

#ifdef NDEBUG

static inline void assertProtocolIsGood(StringView)
{
}

#else

static void assertProtocolIsGood(StringView protocol)
{
    // FIXME: We probably don't need this function any more.
    // The isASCIIAlphaCaselessEqual function asserts that passed-in characters
    // are ones it can handle; the older code did not and relied on these checks.
    for (auto character : protocol.codeUnits()) {
        ASSERT(isASCII(character));
        ASSERT(character > ' ');
        ASSERT(!isASCIIUpper(character));
        ASSERT(toASCIILowerUnchecked(character) == character);
    }
}

#endif

static Lock& defaultPortForProtocolMapForTestingLock()
{
    static NeverDestroyed<Lock> lock;
    return lock;
}

using DefaultPortForProtocolMapForTesting = HashMap<String, uint16_t>;
static DefaultPortForProtocolMapForTesting*& defaultPortForProtocolMapForTesting()
{
    static DefaultPortForProtocolMapForTesting* defaultPortForProtocolMap;
    return defaultPortForProtocolMap;
}

static DefaultPortForProtocolMapForTesting& ensureDefaultPortForProtocolMapForTesting()
{
    DefaultPortForProtocolMapForTesting*& defaultPortForProtocolMap = defaultPortForProtocolMapForTesting();
    if (!defaultPortForProtocolMap)
        defaultPortForProtocolMap = new DefaultPortForProtocolMapForTesting;
    return *defaultPortForProtocolMap;
}

void registerDefaultPortForProtocolForTesting(uint16_t port, const String& protocol)
{
    LockHolder locker(defaultPortForProtocolMapForTestingLock());
    ensureDefaultPortForProtocolMapForTesting().add(protocol, port);
}

void clearDefaultPortForProtocolMapForTesting()
{
    LockHolder locker(defaultPortForProtocolMapForTestingLock());
    if (auto* map = defaultPortForProtocolMapForTesting())
        map->clear();
}

std::optional<uint16_t> defaultPortForProtocol(StringView protocol)
{
    if (auto* overrideMap = defaultPortForProtocolMapForTesting()) {
        LockHolder locker(defaultPortForProtocolMapForTestingLock());
        ASSERT(overrideMap); // No need to null check again here since overrideMap cannot become null after being non-null.
        auto iterator = overrideMap->find(protocol.toStringWithoutCopying());
        if (iterator != overrideMap->end())
            return iterator->value;
    }
    return URLParser::defaultPortForProtocol(protocol);
}

bool isDefaultPortForProtocol(uint16_t port, StringView protocol)
{
    return defaultPortForProtocol(protocol) == port;
}

bool URL::protocolIs(const char* protocol) const
{
    assertProtocolIsGood(StringView(reinterpret_cast<const LChar*>(protocol), strlen(protocol)));

    // JavaScript URLs are "valid" and should be executed even if URL decides they are invalid.
    // The free function protocolIsJavaScript() should be used instead. 
    ASSERT(!equalLettersIgnoringASCIICase(StringView(protocol), "javascript"));

    if (!m_isValid)
        return false;

    // Do the comparison without making a new string object.
    for (unsigned i = 0; i < m_schemeEnd; ++i) {
        if (!protocol[i] || !isASCIIAlphaCaselessEqual(m_string[i], protocol[i]))
            return false;
    }
    return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument.
}

bool URL::protocolIs(StringView protocol) const
{
    assertProtocolIsGood(protocol);

    if (!m_isValid)
        return false;
    
    if (m_schemeEnd != protocol.length())
        return false;

    // Do the comparison without making a new string object.
    for (unsigned i = 0; i < m_schemeEnd; ++i) {
        if (!isASCIIAlphaCaselessEqual(m_string[i], protocol[i]))
            return false;
    }
    return true;
}

String URL::query() const
{
    if (m_queryEnd == m_pathEnd)
        return String();

    return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1)); 
}

String URL::path() const
{
    return m_string.substring(m_portEnd, m_pathEnd - m_portEnd);
}

bool URL::setProtocol(const String& s)
{
    // Firefox and IE remove everything after the first ':'.
    size_t separatorPosition = s.find(':');
    String newProtocol = s.substring(0, separatorPosition);

    if (!isValidProtocol(newProtocol))
        return false;

    if (!m_isValid) {
        URLParser parser(makeString(newProtocol, ":", m_string));
        *this = parser.result();
        return true;
    }

    URLParser parser(makeString(newProtocol, m_string.substring(m_schemeEnd)));
    *this = parser.result();
    return true;
}

static bool containsOnlyASCII(StringView string)
{
    if (string.is8Bit())
        return charactersAreAllASCII(string.characters8(), string.length());
    return charactersAreAllASCII(string.characters16(), string.length());
}
    
// Appends the punycoded hostname identified by the given string and length to
// the output buffer. The result will not be null terminated.
// Return value of false means error in encoding.
static bool appendEncodedHostname(UCharBuffer& buffer, StringView string)
{
    // Needs to be big enough to hold an IDN-encoded name.
    // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
    const unsigned hostnameBufferLength = 2048;
    
    if (string.length() > hostnameBufferLength || containsOnlyASCII(string)) {
        append(buffer, string);
        return true;
    }
    
    UChar hostnameBuffer[hostnameBufferLength];
    UErrorCode error = U_ZERO_ERROR;
    
#if COMPILER(GCC_OR_CLANG)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
    int32_t numCharactersConverted = uidna_IDNToASCII(string.upconvertedCharacters(), string.length(), hostnameBuffer,
        hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error);
#if COMPILER(GCC_OR_CLANG)
#pragma GCC diagnostic pop
#endif
    
    if (error == U_ZERO_ERROR) {
        buffer.append(hostnameBuffer, numCharactersConverted);
        return true;
    }
    return false;
}
    
void URL::setHost(const String& s)
{
    if (!m_isValid)
        return;

    auto colonIndex = s.find(':');
    if (colonIndex != notFound)
        return;

    UCharBuffer encodedHostName;
    if (!appendEncodedHostname(encodedHostName, s))
        return;
    
    bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
    
    StringBuilder builder;
    builder.append(m_string.left(hostStart()));
    if (slashSlashNeeded)
        builder.appendLiteral("//");
    builder.append(StringView(encodedHostName.data(), encodedHostName.size()));
    builder.append(m_string.substring(m_hostEnd));

    URLParser parser(builder.toString());
    *this = parser.result();
}

void URL::removePort()
{
    if (m_hostEnd == m_portEnd)
        return;
    URLParser parser(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
    *this = parser.result();
}

void URL::setPort(unsigned short i)
{
    if (!m_isValid)
        return;

    bool colonNeeded = m_portEnd == m_hostEnd;
    unsigned portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);

    URLParser parser(makeString(m_string.left(portStart), (colonNeeded ? ":" : ""), String::number(i), m_string.substring(m_portEnd)));
    *this = parser.result();
}

void URL::setHostAndPort(const String& hostAndPort)
{
    if (!m_isValid)
        return;

    StringView hostName(hostAndPort);
    StringView port;
    
    auto colonIndex = hostName.find(':');
    if (colonIndex != notFound) {
        port = hostName.substring(colonIndex + 1);
        bool ok;
        int portInt = port.toIntStrict(ok);
        if (!ok || portInt < 0)
            return;
        hostName = hostName.substring(0, colonIndex);
    }

    if (hostName.isEmpty())
        return;

    UCharBuffer encodedHostName;
    if (!appendEncodedHostname(encodedHostName, hostName))
        return;

    bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;

    StringBuilder builder;
    builder.append(m_string.left(hostStart()));
    if (slashSlashNeeded)
        builder.appendLiteral("//");
    builder.append(StringView(encodedHostName.data(), encodedHostName.size()));
    if (!port.isEmpty()) {
        builder.appendLiteral(":");
        builder.append(port);
    }
    builder.append(m_string.substring(m_portEnd));

    URLParser parser(builder.toString());
    *this = parser.result();
}

void URL::setUser(const String& user)
{
    if (!m_isValid)
        return;

    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
    // and to avoid changing more than just the user login.

    unsigned end = m_userEnd;
    if (!user.isEmpty()) {
        String u = encodeWithURLEscapeSequences(user, PercentEncodeUsername);
        if (m_userStart == m_schemeEnd + 1)
            u = "//" + u;
        // Add '@' if we didn't have one before.
        if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
            u.append('@');
        URLParser parser(makeString(m_string.left(m_userStart), u, m_string.substring(end)));
        *this = parser.result();
    } else {
        // Remove '@' if we now have neither user nor password.
        if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
            end += 1;
        // We don't want to parse in the extremely common case where we are not going to make a change.
        if (m_userStart != end) {
            URLParser parser(makeString(m_string.left(m_userStart), m_string.substring(end)));
            *this = parser.result();
        }
    }
}

void URL::setPass(const String& password)
{
    if (!m_isValid)
        return;

    unsigned end = m_passwordEnd;
    if (!password.isEmpty()) {
        String p = ":" + encodeWithURLEscapeSequences(password, PercentEncodePassword) + "@";
        if (m_userEnd == m_schemeEnd + 1)
            p = "//" + p;
        // Eat the existing '@' since we are going to add our own.
        if (end != m_hostEnd && m_string[end] == '@')
            end += 1;
        URLParser parser(makeString(m_string.left(m_userEnd), p, m_string.substring(end)));
        *this = parser.result();
    } else {
        // Remove '@' if we now have neither user nor password.
        if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
            end += 1;
        // We don't want to parse in the extremely common case where we are not going to make a change.
        if (m_userEnd != end) {
            URLParser parser(makeString(m_string.left(m_userEnd), m_string.substring(end)));
            *this = parser.result();
        }
    }
}

void URL::setFragmentIdentifier(StringView identifier)
{
    if (!m_isValid)
        return;

    // FIXME: Optimize the case where the identifier already happens to be equal to what was passed?
    // FIXME: Is it correct to do this without encoding and escaping non-ASCII characters?
    *this = URLParser { makeString(StringView { m_string }.substring(0, m_queryEnd), '#', identifier) }.result();
}

void URL::removeFragmentIdentifier()
{
    if (!m_isValid) {
        ASSERT(!m_queryEnd);
        return;
    }
    if (m_isValid && m_string.length() > m_queryEnd)
        m_string = m_string.left(m_queryEnd);
}
    
void URL::setQuery(const String& query)
{
    if (!m_isValid)
        return;

    // FIXME: '#' and non-ASCII characters must be encoded and escaped.
    // Usually, the query is encoded using document encoding, not UTF-8, but we don't have
    // access to the document in this function.
    // https://webkit.org/b/161176
    if ((query.isEmpty() || query[0] != '?') && !query.isNull()) {
        URLParser parser(makeString(m_string.left(m_pathEnd), "?", query, m_string.substring(m_queryEnd)));
        *this = parser.result();
    } else {
        URLParser parser(makeString(m_string.left(m_pathEnd), query, m_string.substring(m_queryEnd)));
        *this = parser.result();
    }

}

void URL::setPath(const String& s)
{
    if (!m_isValid)
        return;

    // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
    // may be inadvertently affected.
    String path = s;
    if (path.isEmpty() || path[0] != '/')
        path = "/" + path;

    URLParser parser(makeString(m_string.left(m_portEnd), encodeWithURLEscapeSequences(path), m_string.substring(m_pathEnd)));
    *this = parser.result();
}

String decodeURLEscapeSequences(const String& string)
{
    return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding());
}

String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
{
    return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
}

// Caution: This function does not bounds check.
static void appendEscapedChar(char*& buffer, unsigned char c)
{
    *buffer++ = '%';
    placeByteAsHex(c, buffer);
}

String URL::serialize(bool omitFragment) const
{
    if (omitFragment)
        return m_string.left(m_queryEnd);
    return m_string;
}

#if PLATFORM(IOS)

static bool shouldCanonicalizeScheme = true;

void enableURLSchemeCanonicalization(bool enableSchemeCanonicalization)
{
    shouldCanonicalizeScheme = enableSchemeCanonicalization;
}

#endif

template<size_t length>
static inline bool equal(const char* a, const char (&b)[length])
{
#if PLATFORM(IOS)
    if (!shouldCanonicalizeScheme) {
        for (size_t i = 0; i < length; ++i) {
            if (toASCIILower(a[i]) != b[i])
                return false;
        }
        return true;
    }
#endif
    for (size_t i = 0; i < length; ++i) {
        if (a[i] != b[i])
            return false;
    }
    return true;
}

template<size_t lengthB>
static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB])
{
    return lengthA == lengthB && equal(stringA, stringB);
}

bool equalIgnoringFragmentIdentifier(const URL& a, const URL& b)
{
    if (a.m_queryEnd != b.m_queryEnd)
        return false;
    unsigned queryLength = a.m_queryEnd;
    for (unsigned i = 0; i < queryLength; ++i)
        if (a.string()[i] != b.string()[i])
            return false;
    return true;
}

bool protocolHostAndPortAreEqual(const URL& a, const URL& b)
{
    if (a.m_schemeEnd != b.m_schemeEnd)
        return false;

    unsigned hostStartA = a.hostStart();
    unsigned hostLengthA = a.hostEnd() - hostStartA;
    unsigned hostStartB = b.hostStart();
    unsigned hostLengthB = b.hostEnd() - b.hostStart();
    if (hostLengthA != hostLengthB)
        return false;

    // Check the scheme
    for (unsigned i = 0; i < a.m_schemeEnd; ++i) {
        if (a.string()[i] != b.string()[i])
            return false;
    }

    // And the host
    for (unsigned i = 0; i < hostLengthA; ++i) {
        if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
            return false;
    }

    if (a.port() != b.port())
        return false;

    return true;
}

bool hostsAreEqual(const URL& a, const URL& b)
{
    unsigned hostStartA = a.hostStart();
    unsigned hostLengthA = a.hostEnd() - hostStartA;
    unsigned hostStartB = b.hostStart();
    unsigned hostLengthB = b.hostEnd() - hostStartB;
    if (hostLengthA != hostLengthB)
        return false;

    for (unsigned i = 0; i < hostLengthA; ++i) {
        if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
            return false;
    }

    return true;
}

String encodeWithURLEscapeSequences(const String& notEncodedString, PercentEncodeCharacterClass whatToEncode)
{
    CString asUTF8 = notEncodedString.utf8();

    CharBuffer buffer(asUTF8.length() * 3 + 1);
    char* p = buffer.data();

    const char* str = asUTF8.data();
    const char* strEnd = str + asUTF8.length();
    while (str < strEnd) {
        unsigned char c = *str++;
        if (percentEncodeClassTable[c] >= whatToEncode)
            appendEscapedChar(p, c);
        else
            *p++ = c;
    }

    ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));

    return String(buffer.data(), p - buffer.data());
}

String encodeWithURLEscapeSequences(const String& notEncodedString)
{
    CString asUTF8 = notEncodedString.utf8();

    CharBuffer buffer(asUTF8.length() * 3 + 1);
    char* p = buffer.data();

    const char* str = asUTF8.data();
    const char* strEnd = str + asUTF8.length();
    while (str < strEnd) {
        unsigned char c = *str++;
        if (isBadChar(c))
            appendEscapedChar(p, c);
        else
            *p++ = c;
    }

    ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));

    return String(buffer.data(), p - buffer.data());
}

bool URL::isHierarchical() const
{
    if (!m_isValid)
        return false;
    ASSERT(m_string[m_schemeEnd] == ':');
    return m_string[m_schemeEnd + 1] == '/';
}

void URL::copyToBuffer(Vector<char, 512>& buffer) const
{
    // FIXME: This throws away the high bytes of all the characters in the string!
    // That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
    buffer.resize(m_string.length());
    copyASCII(m_string, buffer.data());
}

template<typename StringClass>
bool protocolIsInternal(const StringClass& url, const char* protocol)
{
    // Do the comparison without making a new string object.
    assertProtocolIsGood(StringView(reinterpret_cast<const LChar*>(protocol), strlen(protocol)));
    bool isLeading = true;
    for (unsigned i = 0, j = 0; url[i]; ++i) {
        // Skip leading whitespace and control characters.
        if (isLeading && shouldTrimFromURL(url[i]))
            continue;
        isLeading = false;

        // Skip any tabs and newlines.
        if (isTabNewline(url[i]))
            continue;

        if (!protocol[j])
            return url[i] == ':';
        if (!isASCIIAlphaCaselessEqual(url[i], protocol[j]))
            return false;

        ++j;
    }
    
    return false;
}

bool protocolIs(const String& url, const char* protocol)
{
    return protocolIsInternal(url, protocol);
}

inline bool URL::protocolIs(const String& string, const char* protocol)
{
    return WebCore::protocolIsInternal(string, protocol);
}

bool isValidProtocol(const String& protocol)
{
    // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    if (protocol.isEmpty())
        return false;
    if (!isSchemeFirstChar(protocol[0]))
        return false;
    unsigned protocolLength = protocol.length();
    for (unsigned i = 1; i < protocolLength; i++) {
        if (!isSchemeChar(protocol[i]))
            return false;
    }
    return true;
}

#ifndef NDEBUG

void URL::print() const
{
    printf("%s\n", m_string.utf8().data());
}

#endif

String URL::strippedForUseAsReferrer() const
{
    URL referrer(*this);
    referrer.setUser(String());
    referrer.setPass(String());
    referrer.removeFragmentIdentifier();
    return referrer.string();
}

bool URL::isLocalFile() const
{
    // Including feed here might be a bad idea since drag and drop uses this check
    // and including feed would allow feeds to potentially let someone's blog
    // read the contents of the clipboard on a drag, even without a drop.
    // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
    return protocolIs("file");
}

bool protocolIsJavaScript(const String& url)
{
    return protocolIsInternal(url, "javascript");
}

bool protocolIsJavaScript(StringView url)
{
    return protocolIsInternal(url, "javascript");
}

bool protocolIsInHTTPFamily(const String& url)
{
    // Do the comparison without making a new string object.
    return isASCIIAlphaCaselessEqual(url[0], 'h')
        && isASCIIAlphaCaselessEqual(url[1], 't')
        && isASCIIAlphaCaselessEqual(url[2], 't')
        && isASCIIAlphaCaselessEqual(url[3], 'p')
        && (url[4] == ':' || (isASCIIAlphaCaselessEqual(url[4], 's') && url[5] == ':'));
}

const URL& blankURL()
{
    static NeverDestroyed<URL> staticBlankURL(ParsedURLString, "about:blank");
    return staticBlankURL;
}

bool URL::isBlankURL() const
{
    return protocolIs("about");
}

bool portAllowed(const URL& url)
{
    std::optional<uint16_t> port = url.port();

    // Since most URLs don't have a port, return early for the "no port" case.
    if (!port)
        return true;

    // This blocked port list matches the port blocking that Mozilla implements.
    // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
    static const uint16_t blockedPortList[] = {
        1,    // tcpmux
        7,    // echo
        9,    // discard
        11,   // systat
        13,   // daytime
        15,   // netstat
        17,   // qotd
        19,   // chargen
        20,   // FTP-data
        21,   // FTP-control
        22,   // SSH
        23,   // telnet
        25,   // SMTP
        37,   // time
        42,   // name
        43,   // nicname
        53,   // domain
        77,   // priv-rjs
        79,   // finger
        87,   // ttylink
        95,   // supdup
        101,  // hostriame
        102,  // iso-tsap
        103,  // gppitnp
        104,  // acr-nema
        109,  // POP2
        110,  // POP3
        111,  // sunrpc
        113,  // auth
        115,  // SFTP
        117,  // uucp-path
        119,  // nntp
        123,  // NTP
        135,  // loc-srv / epmap
        139,  // netbios
        143,  // IMAP2
        179,  // BGP
        389,  // LDAP
        465,  // SMTP+SSL
        512,  // print / exec
        513,  // login
        514,  // shell
        515,  // printer
        526,  // tempo
        530,  // courier
        531,  // Chat
        532,  // netnews
        540,  // UUCP
        556,  // remotefs
        563,  // NNTP+SSL
        587,  // ESMTP
        601,  // syslog-conn
        636,  // LDAP+SSL
        993,  // IMAP+SSL
        995,  // POP3+SSL
        2049, // NFS
        3659, // apple-sasl / PasswordServer [Apple addition]
        4045, // lockd
        4190, // ManageSieve [Apple addition]
        6000, // X11
        6665, // Alternate IRC [Apple addition]
        6666, // Alternate IRC [Apple addition]
        6667, // Standard IRC [Apple addition]
        6668, // Alternate IRC [Apple addition]
        6669, // Alternate IRC [Apple addition]
        invalidPortNumber, // Used to block all invalid port numbers
    };

    // If the port is not in the blocked port list, allow it.
    ASSERT(std::is_sorted(std::begin(blockedPortList), std::end(blockedPortList)));
    if (!std::binary_search(std::begin(blockedPortList), std::end(blockedPortList), port.value()))
        return true;

    // Allow ports 21 and 22 for FTP URLs, as Mozilla does.
    if ((port.value() == 21 || port.value() == 22) && url.protocolIs("ftp"))
        return true;

    // Allow any port number in a file URL, since the port number is ignored.
    if (url.protocolIs("file"))
        return true;

    return false;
}

String mimeTypeFromDataURL(const String& url)
{
    ASSERT(protocolIsInternal(url, "data"));

    // FIXME: What's the right behavior when the URL has a comma first, but a semicolon later?
    // Currently this code will break at the semicolon in that case. Not sure that's correct.
    auto index = url.find(';', 5);
    if (index == notFound)
        index = url.find(',', 5);
    if (index == notFound) {
        // FIXME: There was an old comment here that made it sound like this should be returning text/plain.
        // But we have been returning empty string here for some time, so not changing its behavior at this time.
        return emptyString();
    }
    if (index == 5)
        return ASCIILiteral("text/plain");
    ASSERT(index >= 5);
    return url.substring(5, index - 5).convertToASCIILowercase();
}

String mimeTypeFromURL(const URL& url)
{
    String decodedPath = decodeURLEscapeSequences(url.path());
    String extension = decodedPath.substring(decodedPath.reverseFind('.') + 1);

    // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure
    return MIMETypeRegistry::getMIMETypeForExtension(extension);
}

String URL::stringCenterEllipsizedToLength(unsigned length) const
{
    if (string().length() <= length)
        return string();

    return string().left(length / 2 - 1) + "..." + string().right(length / 2 - 2);
}

URL URL::fakeURLWithRelativePart(const String& relativePart)
{
    return URL(URL(), "webkit-fake-url://" + createCanonicalUUIDString() + '/' + relativePart);
}

URL URL::fileURLWithFileSystemPath(const String& filePath)
{
    return URL(URL(), "file:///" + filePath);
}

}