# http.py [plain text]

# This library is free software; you can redistribute it and/or
# modify it under the terms of version 2.1 of the GNU Lesser General Public
# License as published by the Free Software Foundation.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""HyperText Transfer Protocol implementation.

The second coming.

Future Plans:
 - HTTP client support will at some point be refactored to support HTTP/1.1.
 - Accept chunked data from clients in server.
 - Other missing HTTP features from the RFC.

Maintainer: U{James Y Knight <mailto:foom@fuhm.net>}
"""

# system imports
from cStringIO import StringIO
import tempfile
import base64, binascii
import cgi
import socket
import math
import time
import calendar
import warnings
import os

# twisted imports
from twisted.internet import interfaces, reactor, protocol, address
from twisted.protocols import policies, basic
from twisted.python import log, components
try: # try importing the fast, C version
    from twisted.protocols._c_urlarg import unquote
except ImportError:
    from urllib import unquote

# sibling imports
import responsecode


# HTTP protocol version string for this module.
# NOTE(review): nothing in the visible part of this file reads this constant;
# the request/response code compares against the literal "HTTP/1.1" instead.
protocol_version = "HTTP/1.1"


def parse_qs(qs, keep_blank_values=0, strict_parsing=0, unquote=unquote):
    """Parse a query string into a dict, like cgi.parse_qs, but with a
    pluggable unquote function.

    Pairs may be separated by either "&" or ";".  In both the name and the
    value "+" is turned into a space before C{unquote} is applied.

    @param qs: the query string to parse.
    @param keep_blank_values: if true, pairs with an empty value
        (e.g. C{"a="}) are kept; otherwise they are dropped.
    @param strict_parsing: if true, a field without "=" raises ValueError;
        otherwise such fields are silently skipped.
    @param unquote: callable used to decode names and values.

    @returns: a dict mapping each name to the list of its values, in the
        order they appeared.
    """
    parsed = {}
    for ampersand_part in qs.split("&"):
        for field in ampersand_part.split(";"):
            try:
                name, value = field.split("=", 1)
            except ValueError:
                # no "=" in this field
                if strict_parsing:
                    raise
                continue
            if not (value or keep_blank_values):
                continue
            name = unquote(name.replace("+", " "))
            value = unquote(value.replace("+", " "))
            parsed.setdefault(name, []).append(value)
    return parsed

def toChunk(data):
    """Encode a string as one chunk of the HTTP chunked transfer coding.

    @param data: the payload of the chunk.

    @returns: a 3-tuple of strings: the hexadecimal length line, the
        payload itself, and the trailing CRLF.
    """
    size_line = "%x\r\n" % len(data)
    return (size_line, data, "\r\n")
    
def fromChunk(data):
    """Decode the first chunk of chunked-encoded data.

    @param data: a string starting with a chunk: a hexadecimal size line
        (optionally followed by ";"-separated chunk extensions, which are
        ignored), CRLF, the payload and a closing CRLF.

    @returns: tuple (result, remaining) where C{result} is the payload of
        the first chunk and C{remaining} is whatever follows it.

    @raise ValueError: if there is no size line, the size is not a valid
        non-negative hexadecimal number, or the payload is not followed by
        CRLF (which is also the case when C{data} is too short).
    """
    prefix, rest = data.split('\r\n', 1)
    # The size may be followed by chunk extensions ("1a;name=value");
    # only the part before the first ";" is the size.
    size_field = prefix.split(';', 1)[0]
    length = int(size_field, 16)
    if length < 0:
        raise ValueError("chunk size must not be negative")
    if rest[length:length+2] != '\r\n':
        raise ValueError("chunk must end with CRLF")
    return rest[:length], rest[length+2:]




class StringTransport:
    """
    I buffer everything written to me in a StringIO object, so that I can
    be used where a transport is expected. Besides the StringIO methods
    (which I forward), I provide 'writeSequence'.
    """

    def __init__(self):
        # the underlying in-memory buffer
        self.s = StringIO()

    def writeSequence(self, seq):
        """Write all strings in C{seq} to the buffer, concatenated."""
        joined = ''.join(seq)
        self.s.write(joined)

    def __getattr__(self, attr):
        """Forward every attribute I don't have to the buffer."""
        # Go through __dict__ directly so that a missing 's' does not
        # re-enter __getattr__.
        buf = self.__dict__['s']
        return getattr(buf, attr)

# response codes that must have empty bodies; Request never uses chunked
# encoding for these codes and discards any body data written for them
NO_BODY_CODES = (204, 304)

class Request:
    """A HTTP request.

    Subclasses should override the process() method to determine how
    the request will be processed.

    @ivar method: The HTTP method that was used.
    @ivar uri: The full URI that was requested (includes arguments).
    @ivar path: The path only (arguments not included).
    @ivar args: All of the arguments. Only the arguments from the query
                string of the URI are parsed by this class (see
                L{handleContentComplete}).
    @type args: A mapping of strings (the argument names) to lists of values.
                i.e., ?foo=bar&foo=baz&quux=spam results in
                {'foo': ['bar', 'baz'], 'quux': ['spam']}.
    @ivar clientproto: The protocol version string sent by the client.
    @ivar in_headers: The received headers, wrapped in
                      http_headers.RequestHeaders.
    @ivar out_headers: The response headers (http_headers.ResponseHeaders).
    @ivar received_headers: All received headers
    """

    # NOTE(review): this class is in the middle of a header-API migration and
    # several references cannot be resolved from this file alone:
    #   - 'http_headers' is used in __init__ but is not imported here;
    #   - self.headers (_initialWrite, setHeader) and self.received_headers
    #     (setHost) are never assigned, only in_headers/out_headers are;
    #   - getReqHeader, getReqHeaderRaw, getRespHeader, setRespHeader and
    #     self.lastModified are used but not defined in this class.
    # They are left untouched below; confirm the intended API before use.

    __implements__ = interfaces.IConsumer,

    producer = None
    finished = 0
    code = responsecode.OK
    code_message = responsecode.RESPONSES[responsecode.OK]
    startedWriting = 0
    chunked = 0
    sentLength = 0 # content-length of response, or total bytes sent via chunking

    # Default for the flag set by setHost(); read by isSecure().
    _forceSSL = False


    def __init__(self, channel, queued, command, path, version, raw_in_headers):
        """
        @param channel: the channel we're connected to.
        @param queued: are we in the request queue, or can we start writing to
            the transport?
        @param command: the HTTP method of the request (stored as C{method}).
        @param path: the requested URI (stored as C{uri}).
        @param version: the protocol version string sent by the client
            (stored as C{clientproto}).
        @param raw_in_headers: the received headers, passed on to
            http_headers.RequestHeaders.
        """
        self.channel = channel
        self.queued = queued
        self.method = command
        self.uri = path
        self.clientproto = version

        self.out_headers = http_headers.ResponseHeaders()
        self.in_headers = http_headers.RequestHeaders(raw_in_headers)

        # While queued, output is buffered in memory; noLongerQueued()
        # flushes the buffer to the real transport.
        if queued:
            self.transport = StringTransport()
        else:
            self.transport = self.channel.transport

    def _cleanup(self):
        """Called when have finished responding and are no longer queued."""
        if self.producer:
            log.err(RuntimeError("Producer was not unregistered for %s" % self.uri))
            self.unregisterProducer()
        self.channel.requestDone(self)
        del self.channel

    # methods for channel - end users should not use these

    def noLongerQueued(self):
        """Notify the object that it is no longer queued.

        We start writing whatever data we have to the transport, etc.

        This method is not intended for users.

        @raise RuntimeError: if the request was not queued.
        """
        if not self.queued:
            raise RuntimeError("noLongerQueued() got called unnecessarily.")

        self.queued = 0

        # set transport to real one and send any buffer data
        data = self.transport.getvalue()
        self.transport = self.channel.transport
        if data:
            self.transport.write(data)

        # if we have producer, register it with transport
        if (self.producer is not None) and not self.finished:
            self.transport.registerProducer(self.producer, True)

        # if we're finished, clean up
        if self.finished:
            self._cleanup()

    def handleContentChunk(self, data):
        """Called by channel when a piece of data has been received.

        Should be overridden by a subclass to do something appropriate."""
        pass

    def handleContentComplete(self):
        """Called by channel when all data has been received.

        Splits the URI into C{path} and query-string C{args}, and records
        the peer and host addresses of the channel's transport.

        This method is not intended for users.
        """
        self.args = {}
        self.stack = []

        x = self.uri.split('?')

        if len(x) == 1:
            self.path = self.uri
        else:
            if len(x) != 2:
                log.msg("May ignore parts of this invalid URI: %s"
                        % repr(self.uri))
            # anything after a second '?' is dropped
            self.path, argstring = x[0], x[1]
            self.args = parse_qs(argstring, 1)

        # cache the client and server information, we'll need this later to be
        # serialized and sent with the request so CGIs will work remotely
        self.client = self.channel.transport.getPeer()
        self.host = self.channel.transport.getHost()

    def __repr__(self):
        return '<%s %s %s>'% (self.method, self.uri, self.clientproto)

    # consumer interface

    def registerProducer(self, producer, streaming=True):
        """Register a producer.
        Only streaming (push) producers supported.

        While the request is queued the producer is paused; otherwise it is
        registered with the transport right away.

        @raise ValueError: if C{streaming} is false, or if another producer
            is still registered.
        """

        if not streaming:
            raise ValueError("non-streaming (pull) producers not supported.")

        if self.producer:
            raise ValueError("registering producer %s before previous one (%s) was unregistered" % (producer, self.producer))

        self.producer = producer

        if self.queued:
            producer.pauseProducing()
        else:
            self.transport.registerProducer(producer, True)

    def unregisterProducer(self):
        """Unregister the producer."""
        if not self.queued:
            self.transport.unregisterProducer()
        self.producer = None

    # private http response methods

    def _sendError(self, code, resp=''):
        """Write a bare status line (no headers, no body) to the transport."""
        self.transport.write('%s %s %s\r\n\r\n' % (self.clientproto, code, resp))

    # The following is the public interface that people should be
    # writing to.

    def finish(self):
        """We are finished writing data.

        Calling this a second time only emits a warning.
        """
        if self.finished:
            warnings.warn("Warning! request.finish called twice.", stacklevel=2)
            return

        if not self.startedWriting:
            # write headers
            self.write('')

        # debug code just in case I made a mistake
        if self.chunked and self.code in NO_BODY_CODES:
            raise RuntimeError("we screwed up")

        if self.chunked:
            # write last chunk and closing CRLF
            self.transport.write("0\r\n\r\n")

        # log request
        if hasattr(self.channel, "factory"):
            self.channel.factory.log(self)

        self.finished = 1
        if not self.queued:
            self._cleanup()

    def _initialWrite(self):
        """Write the status line and the headers.

        As a side effect, for HEAD requests and for response codes in
        NO_BODY_CODES, C{self.write} is replaced by a function that discards
        its argument.

        @return: True if body data may be written, False if not.
        """
        self.startedWriting = 1
        version = self.clientproto
        l = []
        l.append('%s %s %s\r\n' % (version, self.code,
                                   self.code_message))
        # if we don't have a content length, we send data in
        # chunked mode, so that we can support pipelining in
        # persistent connections.
        if ((version == "HTTP/1.1") and
            (self.headers.get('content-length', None) is None) and
            (self.code not in NO_BODY_CODES)):
            l.append("%s: %s\r\n" % ('Transfer-encoding', 'chunked'))
            self.chunked = 1
        for name, value in self.headers.items():
            l.append("%s: %s\r\n" % (name.capitalize(), value))
        l.append("\r\n")

        self.transport.writeSequence(l)

        # if this is a "HEAD" request, we shouldn't return any data
        if self.method == "HEAD":
            self.write = lambda data: None
            return False

        # for certain result codes, we should never return any data
        if self.code in NO_BODY_CODES:
            self.write = lambda data: None
            return False

        return True

    def write(self, data):
        """
        Write some data as a result of an HTTP request.  The first
        time this is called, it writes out the status line and headers
        before the data.
        """
        if not self.startedWriting:
            if not self._initialWrite():
                return
        self.sentLength = self.sentLength + len(data)
        if data:
            if self.chunked:
                self.transport.writeSequence(toChunk(data))
            else:
                self.transport.write(data)

    # FIXME: usefulize this
    def writeFile(self, file):
        """
        Write data from a file, possibly more efficiently than write(data)
        would do. Otherwise identical to write(file.read()).
        """
        self.write(file.read())

    def setResponseCode(self, code, message=None):
        """Set the HTTP response code.

        @param code: the numeric status code.
        @param message: the reason phrase; if not given, it is looked up in
            responsecode.RESPONSES, falling back to "Unknown Status".
        """
        self.code = code
        if message:
            self.code_message = message
        else:
            self.code_message = responsecode.RESPONSES.get(code, "Unknown Status")

    def setHeader(self, k, v):
        """Set an outgoing HTTP header.
        """
        self.headers[k.lower()] = v

    def redirect(self, url):
        """Utility function that does a redirect.

        The request should have finish() called after this.
        """
        self.setResponseCode(responsecode.FOUND)
        self.setHeader("location", url)

    def setLastModified(self, when):
        """Set the X{Last-Modified} time for the response to this request.

        If I am called more than once, I ignore attempts to set
        Last-Modified earlier, only replacing the Last-Modified time
        if it is to a later value.

        @param when: The last time the resource being returned was
            modified, in seconds since the epoch.
        @type when: number
        """
        # time.time() may be a float, but the HTTP-date strings are
        # only good for whole seconds.
        when = long(math.ceil(when))
        lastModified = self.getRespHeader('Last-Modified')
        if not lastModified or (lastModified < when):
            self.setRespHeader('Last-Modified', when)

    def checkBody(self):
        """Check to see if this request should have a body. As a side-effect
        may modify my response code to L{NOT_MODIFIED} or L{PRECONDITION_FAILED},
        if appropriate.

        Call this function after setting the ETag and Last-Modified
        output headers, but before actually proceeding with request
        processing.

        This examines the appropriate request headers for conditionals,
        the existing response headers and sets the response code as necessary.

        @return: True if you should write a body, False if you should
                 not.
        """
        tags = self.getReqHeader("if-none-match")
        etag = self.getRespHeader("etag")
        if tags:
            if (etag in tags) or ('*' in tags):
                # a matching If-None-Match yields 304 for HEAD/GET and a
                # failed precondition for every other method
                if self.method in ("HEAD", "GET"):
                    self.setResponseCode(responsecode.NOT_MODIFIED)
                else:
                    self.setResponseCode(responsecode.PRECONDITION_FAILED)
                return False

        modified_since = self.getReqHeader('if-modified-since')
        if modified_since:
            if modified_since >= self.lastModified:
                self.setResponseCode(responsecode.NOT_MODIFIED)
                return False

        # if this is a "HEAD" request, we shouldn't return any data
        if self.method == "HEAD":
            return False

        return True

    def getRequestHostname(self):
        """Get the hostname that the user passed in to the request.

        This will either use the Host: header (if it is available) or the
        host we are listening on if the header is unavailable.
        """
        return (self.getReqHeader('host') or
                socket.gethostbyaddr(self.getHost()[1])[0]
                ).split(':')[0]

    def getHost(self):
        """Get my originally requesting transport's host.

        Don't rely on the 'transport' attribute, since Request objects may be
        copied remotely.  For information on this method's return value, see
        twisted.internet.tcp.Port.
        """
        return self.host

    def setHost(self, host, port, ssl=0):
        """Change the host and port the request thinks it's using.

        This method is useful for working with reverse HTTP proxies (e.g.
        both Squid and Apache's mod_proxy can do this), when the address
        the HTTP client is using is different than the one we're listening on.

        For example, Apache may be listening on https://www.example.com, and then
        forwarding requests to http://localhost:8080, but we don't want HTML produced
        by Twisted to say 'http://localhost:8080', they should say 'https://www.example.com',
        so we do::

           request.setHost('www.example.com', 443, ssl=1)

        This method is experimental.
        """
        self._forceSSL = ssl
        self.received_headers["host"] = host
        self.host = address.IPv4Address("TCP", host, port)

    def getClientIP(self):
        """Return the client's IP address, or None if the peer address is
        not an IPv4Address."""
        if isinstance(self.client, address.IPv4Address):
            return self.client.host
        else:
            return None

    def isSecure(self):
        """Return a true value if SSL was forced through setHost() or the
        channel's transport implements ISSLTransport."""
        return self._forceSSL or components.implements(self.channel.transport, interfaces.ISSLTransport)

    def _authorize(self):
        """Set C{self.user} and C{self.password} from a Basic Authorization
        header; both become empty strings if the header is missing or
        cannot be decoded."""
        # Authorization, (mostly) per the RFC
        try:
            authh = self.getReqHeaderRaw("Authorization")
            if not authh:
                self.user = self.password = ''
                return

            bas, upw = authh.split()
            if bas.lower() != "basic":
                raise ValueError
            upw = base64.decodestring(upw)
            self.user, self.password = upw.split(':', 1)
        except (binascii.Error, ValueError):
            self.user = self.password = ""
        except:
            # anything unexpected is logged rather than propagated
            log.err()
            self.user = self.password = ""

    def getUser(self):
        """Return the user name from the Authorization header ('' if none),
        decoding the header on first use."""
        try:
            return self.user
        except AttributeError:
            # not decoded yet
            pass
        self._authorize()
        return self.user

    def getPassword(self):
        """Return the password from the Authorization header ('' if none),
        decoding the header on first use."""
        try:
            return self.password
        except AttributeError:
            # not decoded yet
            pass
        self._authorize()
        return self.password

    def connectionLost(self, reason):
        """connection was lost"""
        pass


class HTTPChannel(basic.LineReceiver, policies.TimeoutMixin):
    """A receiver for HTTP requests. Handles the hop-by-hop behavior.

    The request line and headers are read line by line; once the blank line
    ending the headers arrives, a request object is created with
    C{requestFactory} and appended to the C{requests} queue. Body data is
    then read in raw mode and handed to the newest request.

    @ivar requests: the queue of requests being handled, oldest first.
    @ivar persistent: whether the connection stays open after the current
        request.
    @ivar length: number of body bytes still expected for the current
        request.
    """

    # NOTE(review): 'timeOut' is not defined here; HTTPFactory.buildProtocol
    # sets it on each instance.

    # set in instances or subclasses
    maxHeaderLength = 10240 # maximum length of headers (10KiB)
    requestFactory = Request


    _partialheader = ''
    # 1: expecting a request line; 2: expecting a request line, one blank
    # line already skipped; 0: reading headers
    _first_line = 1
    _headerlen = 0
    _savedTimeOut = None

    # Read by lineReceived() before the first request has set it, so it
    # needs a default.
    persistent = True

    def __init__(self):
        # raw headers of the request being read: name -> list of values
        self._reqHeaders = {}
        # the request queue
        self.requests = []

    def connectionMade(self):
        self.setTimeout(self.timeOut)

    def _badRequest(self):
        """Answer with a bare 400 status line and drop the connection."""
        self.transport.write("HTTP/1.1 400 Bad Request\r\n\r\n")
        self.transport.loseConnection()

    def lineReceived(self, line):
        """Handle one line of the request line / header section."""
        self.resetTimeout()

        if self._first_line:
            # if this connection is not persistent, drop any data which
            # the client (illegally) sent after the last request.
            if not self.persistent:
                self.dataReceived = self.lineReceived = lambda *args: None
                return

            # IE sends an extraneous empty line (\r\n) after a POST request;
            # eat up such a line, but only ONCE
            if not line and self._first_line == 1:
                self._first_line = 2
                return

            self._first_line = 0
            parts = line.split()
            if len(parts) != 3:
                self._badRequest()
                return
            self._command, self._path, self._version = parts
        elif line == '':
            # blank line: end of headers
            if self._partialheader:
                self.headerReceived(self._partialheader)
            self._partialheader = ''
            self.allHeadersReceived()
            if self.length == 0:
                self.allContentReceived()
            else:
                self.setRawMode()
        elif line[0] in ' \t':
            # continuation of the previous header line
            self._partialheader = self._partialheader+line
        else:
            if self._partialheader:
                self.headerReceived(self._partialheader)
            self._partialheader = line

    def headerReceived(self, line):
        """Store this header away. Check for too much header data
           (> maxHeaderLength) and abort the connection if so.

           A line without a colon is answered with 400 Bad Request.
        """
        try:
            name, val = line.split(':', 1)
        except ValueError:
            self._badRequest()
            return
        val = val.lstrip(' \t')
        old = self._reqHeaders.get(name, None)
        if old is None:
            old = []
            self._reqHeaders[name]=old
        old.append(val)

        self._headerlen = self._headerlen+ len(line)

        if self._headerlen > self.maxHeaderLength:
            self._badRequest()

    def allHeadersReceived(self):
        """Create the request object for the headers just read and reset
        the per-request header state."""
        # set connection variables to
        self.length = 0
        self.persistent = False

        # Split off connection-related headers
        # NOTE(review): the result is currently unused.
        connHeaders = self.splitConnectionHeaders()
        # create a new Request object
        request = self.requestFactory(self, len(self.requests), self._command, self._path, self._version, self._reqHeaders)
        self.requests.append(request)

        # Reset header state variables
        del self._command, self._path, self._version
        # Reset rather than delete: when no header line was received there
        # is no instance attribute to delete.
        self._headerlen = 0

        self._reqHeaders = {}

        self.persistent = self.checkPersistence(request)
        # NOTE(review): getReqHeader is not defined by Request in this file;
        # this assumes it returns a number, or a false value when the header
        # is absent - confirm.
        self.length = request.getReqHeader('Content-Length') or 0

    def allContentReceived(self):
        """Reset the per-request state and tell the newest request that its
        content is complete."""
        # reset state variables, so we don't interfere with next request
        self.length = 0
        self._first_line = 1

        # Disable the idle timeout, in case this request takes a long
        # time to finish generating output.
        if self.timeOut:
            self._savedTimeOut = self.setTimeout(None)

        req = self.requests[-1]
        req.handleContentComplete()

    def rawDataReceived(self, data):
        """Pass body data to the newest request; anything beyond the
        expected length is fed back into line mode."""
        if len(data) < self.length:
            self.requests[-1].handleContentChunk(data)
            self.length = self.length - len(data)
        else:
            self.requests[-1].handleContentChunk(data[:self.length])
            extraneous = data[self.length:]
            self.allContentReceived()
            self.setLineMode(extraneous)

    def requestDone(self, request):
        """Called by first request in queue when it is done.

        @raise TypeError: if C{request} is not the first request in the
            queue.
        """
        if request != self.requests[0]: raise TypeError
        del self.requests[0]

        if self.persistent:
            # notify next request it can start writing
            if self.requests:
                self.requests[0].noLongerQueued()
            else:
                # idle again: restore the timeout disabled in
                # allContentReceived()
                if self._savedTimeOut:
                    self.setTimeout(self._savedTimeOut)
        else:
            self.transport.loseConnection()

    def timeoutConnection(self):
        log.msg("Timing out client: %s" % str(self.transport.getPeer()))
        policies.TimeoutMixin.timeoutConnection(self)

    def connectionLost(self, reason):
        """Cancel the timeout and notify all queued requests."""
        self.setTimeout(None)
        for request in self.requests:
            request.connectionLost(reason)

    def splitConnectionHeaders(self):
        """Move the connection (hop-by-hop) headers out of the request
        headers.

        NOTE(review): this uses the http_headers API (getRawHeader,
        removeHeader, hasHeader) on self._reqHeaders, which is a plain dict
        elsewhere in this class, and 'http_headers' is not imported in this
        file - confirm the intended header container.

        @return: the headers that apply to the connection.
        """
        # Split off headers for the connection from headers for the request.

        def move(name):
            h = reqHeaders.getRawHeader(name, None)
            if h is not None:
                reqHeaders.removeHeader(name)
                connHeaders.setRawHeader(name, h)

        connHeaderNames = ['Connection', 'Keep-Alive', 'Proxy-Authenticate', 'Proxy-Authorization', 'TE', 'Trailers', 'Transfer-Encoding', 'Upgrade']
        reqHeaders = self._reqHeaders
        connHeaders = http_headers.Headers()

        move('Connection')
        if connHeaders.hasHeader('Connection'):
            # _version is the third token of the request line,
            # e.g. "HTTP/1.1"
            if self._version != "HTTP/1.1":
                # Remove all headers mentioned in Connection, because a HTTP 1.0
                # proxy might have erroneously forwarded it from a 1.1 client.
                for name in connHeaders.getHeader('Connection'):
                    if reqHeaders.hasHeader(name):
                        reqHeaders.removeHeader(name)
            else:
                # Otherwise, just add the headers listed to the list of those to move
                connHeaderNames.extend(connHeaders.getHeader('Connection'))

        for headername in connHeaderNames:
            move(headername)

        # Content-Length is a both a connection header (defining length of
        # transmission, and a content header (defining length of content).
        h = reqHeaders.getRawHeader('Content-Length', None)
        if h is not None:
            connHeaders.setRawHeader('Content-Length', h)

        return connHeaders

    def checkPersistence(self, request):
        """Check if the channel should close or not.

        @param request: the request that was just created.
        @return: a true value if the connection should stay open.
        """

        # HTTP 1.0 persistent connection support is unimplemented:
        # we need a way to disable pipelining. HTTP 1.0 can't do
        # pipelining since we can't know in advance if we'll have a
        # outgoing content-length header. If we don't have the header
        # we need to close the connection. In HTTP 1.1 this is not an
        # issue since we use chunked encoding if content-length is
        # not available.

        # Also, who really cares about extra features for HTTP/1.0; nearly
        # everything supports 1.1 these days, so as long as 1.0 *works*, that's
        # fine. (Hrm just noticed, Squid only supports HTTP 1.0 so far, so this
        # might be an issue worth thinking about after all)

        # The channel's own version attributes were deleted by
        # allHeadersReceived(); the request keeps the version in clientproto.
        if request.clientproto == "HTTP/1.1":
            # NOTE(review): getReqHeader/addRespHeader are not defined on
            # this class, and the Connection header has already been moved
            # out of the request headers by splitConnectionHeaders() -
            # confirm where this lookup is meant to go.
            if 'close' in self.getReqHeader('connection'):
                self.addRespHeader('connection', 'close')
                return 0
            else:
                return 1
        else:
            return 0



class HTTPFactory(protocol.ServerFactory):
    """Factory for HTTP server.

    @ivar logPath: absolute path of the access log, or None to log to
        twisted.python.log's logfile.
    @ivar timeOut: idle timeout, copied onto every protocol built.
    """

    # NOTE(review): _logDateTimeStart, _logDateTimeStop and _logDateTime are
    # used below but are not defined in the visible part of this file.

    protocol = HTTPChannel

    logPath = None

    timeOut = 60 * 60 * 12

    def __init__(self, logPath=None, timeout=60*60*12):
        """
        @param logPath: path of the access log file (made absolute), or
            None.
        @param timeout: value for C{timeOut}.
        """
        if logPath is not None:
            logPath = os.path.abspath(logPath)
        self.logPath = logPath
        self.timeOut = timeout

    def buildProtocol(self, addr):
        p = protocol.ServerFactory.buildProtocol(self, addr)
        # timeOut needs to be on the Protocol instance cause
        # TimeoutMixin expects it there
        p.timeOut = self.timeOut
        return p

    def startFactory(self):
        """Open the log file (or fall back to log.logfile)."""
        _logDateTimeStart()
        if self.logPath:
            self.logFile = self._openLogFile(self.logPath)
        else:
            self.logFile = log.logfile

    def stopFactory(self):
        """Close the log file, unless it is the shared log.logfile."""
        if hasattr(self, "logFile"):
            if self.logFile != log.logfile:
                self.logFile.close()
            del self.logFile
        _logDateTimeStop()

    def _openLogFile(self, path):
        """Override in subclasses, e.g. to use twisted.python.logfile.

        @return: the file at C{path}, opened line-buffered for appending
            and positioned at its end.
        """
        f = open(path, "a", 1)
        # offset 0 relative to the end of the file (whence=2)
        f.seek(0, 2)
        return f

    def log(self, request):
        """Log a request's result to the logfile, by default in combined log format."""
        line = '%s - - %s "%s" %d %s "%s" "%s"\n' % (
            request.getClientIP(),
            # request.getUser() or "-", # the remote user is almost never important
            _logDateTime,
            '%s %s %s' % (request.method, request.uri, request.clientproto),
            request.code,
            request.sentLength or "-",
            request.getReqHeader("referer") or "-",
            request.getReqHeader("user-agent") or "-")
        self.logFile.write(line)




#     def gotLength(self, length):
#         """Called when HTTP channel got length of content in this request.

#         This method is not intended for users.
#         """
#         if length < 100000:
#             self.content = StringIO()
#         else:
#             self.content = tempfile.TemporaryFile()

#     def handleContentChunk(self, data):
#         """Write a chunk of data.

#         This method is not intended for users.
#         """
#         self.content.write(data)


#         # Argument processing
#         args = self.args
#         ctype = self.getHeader('content-type')
#         if self.method == "POST" and ctype:
#             mfd = 'multipart/form-data'
#             key, pdict = cgi.parse_header(ctype)
#             if key == 'application/x-www-form-urlencoded':
#                 args.update(
#                     parse_qs(self.content.read(), 1))
#             elif key == mfd:
#                 args.update(
#                     cgi.parse_multipart(self.content, pdict))
#             else:
#                 pass