diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5e989a5..74c0ae2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -41,7 +41,7 @@ jobs: __version__\s*=\s*(?:['"])([[:PEP440:]])(?:['"]) - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 if: steps.release.outputs.version == 0 with: python-version: ${{ matrix.python-version }} @@ -49,5 +49,6 @@ jobs: - name: Test if: steps.release.outputs.version == 0 run: | + pip install wheel pip install -e .[test] python setup.py test diff --git a/.gitignore b/.gitignore index 0d7df92..284ec93 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,5 @@ __pycache__/ /.pytest_cache /.mypy_cache /.vscode +.eggs +.venv diff --git a/.gitmodules b/.gitmodules index ac94662..52392af 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "vendor/http-parser"] path = vendor/http-parser url = https://github.com/nodejs/http-parser.git +[submodule "vendor/llhttp"] + path = vendor/llhttp + url = https://github.com/nodejs/llhttp.git diff --git a/Makefile b/Makefile index d43dc63..b34b26e 100644 --- a/Makefile +++ b/Makefile @@ -13,17 +13,16 @@ release: compile test python3 setup.py sdist upload -test: - python3 setup.py test +test: compile + python3 -m unittest -v clean: find $(ROOT)/httptools/parser -name '*.c' | xargs rm -f find $(ROOT)/httptools/parser -name '*.html' | xargs rm -f -distclean: +distclean: clean git --git-dir="$(ROOT)/vendor/http-parser/.git" clean -dfx - find $(ROOT)/httptools/parser -name '*.c' | xargs rm -f - find $(ROOT)/httptools/parser -name '*.html' | xargs rm -f + git --git-dir="$(ROOT)/vendor/llhttp/.git" clean -dfx testinstalled: diff --git a/README.md b/README.md index 8be3739..76b45d2 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,11 @@ The package is available on PyPI: `pip install httptools`. 
# APIs httptools contains two classes `httptools.HttpRequestParser`, -`httptools.HttpResponseParser` and a function for parsing URLs -`httptools.parse_url`. See unittests for examples. +`httptools.HttpResponseParser` (fulfilled through +[llhttp](https://github.com/nodejs/llhttp)) and a function for +parsing URLs `httptools.parse_url` (through +[http-parser](https://github.com/nodejs/http-parser) for now). +See unittests for examples. ```python diff --git a/httptools/parser/__init__.py b/httptools/parser/__init__.py index d53bad9..ba371f5 100644 --- a/httptools/parser/__init__.py +++ b/httptools/parser/__init__.py @@ -1,4 +1,5 @@ from .parser import * # NoQA from .errors import * # NoQA +from .url_parser import * # NoQA -__all__ = parser.__all__ + errors.__all__ # NoQA +__all__ = parser.__all__ + errors.__all__ + url_parser.__all__ # NoQA diff --git a/httptools/parser/cparser.pxd b/httptools/parser/cparser.pxd index bad2060..617f0c1 100644 --- a/httptools/parser/cparser.pxd +++ b/httptools/parser/cparser.pxd @@ -1,139 +1,156 @@ -from libc.stdint cimport uint16_t, uint32_t, uint64_t +from libc.stdint cimport int32_t, uint8_t, uint16_t, uint64_t -cdef extern from "../../vendor/http-parser/http_parser.h": - ctypedef int (*http_data_cb) (http_parser*, +cdef extern from "llhttp.h": + struct llhttp__internal_s: + int32_t _index + void *_span_pos0 + void *_span_cb0 + int32_t error + const char *reason + const char *error_pos + void *data + void *_current + uint64_t content_length + uint8_t type + uint8_t method + uint8_t http_major + uint8_t http_minor + uint8_t header_state + uint16_t flags + uint8_t upgrade + uint16_t status_code + uint8_t finish + void *settings + ctypedef llhttp__internal_s llhttp__internal_t + ctypedef llhttp__internal_t llhttp_t + + ctypedef int (*llhttp_data_cb) (llhttp_t*, const char *at, size_t length) except -1 - ctypedef int (*http_cb) (http_parser*) except -1 - - struct http_parser: - unsigned int type - unsigned int flags - unsigned int state - 
unsigned int header_state - unsigned int index - - uint32_t nread - uint64_t content_length - - unsigned short http_major - unsigned short http_minor - unsigned int status_code - unsigned int method - unsigned int http_errno - - unsigned int upgrade - - void *data - - struct http_parser_settings: - http_cb on_message_begin - http_data_cb on_url - http_data_cb on_status - http_data_cb on_header_field - http_data_cb on_header_value - http_cb on_headers_complete - http_data_cb on_body - http_cb on_message_complete - http_cb on_chunk_header - http_cb on_chunk_complete - - enum http_parser_type: + ctypedef int (*llhttp_cb) (llhttp_t*) except -1 + + struct llhttp_settings_s: + llhttp_cb on_message_begin + llhttp_data_cb on_url + llhttp_data_cb on_status + llhttp_data_cb on_header_field + llhttp_data_cb on_header_value + llhttp_cb on_headers_complete + llhttp_data_cb on_body + llhttp_cb on_message_complete + llhttp_cb on_chunk_header + llhttp_cb on_chunk_complete + ctypedef llhttp_settings_s llhttp_settings_t + + enum llhttp_type: + HTTP_BOTH, HTTP_REQUEST, - HTTP_RESPONSE, - HTTP_BOTH + HTTP_RESPONSE + ctypedef llhttp_type llhttp_type_t - enum http_errno: + enum llhttp_errno: HPE_OK, - HPE_CB_message_begin, - HPE_CB_url, - HPE_CB_header_field, - HPE_CB_header_value, - HPE_CB_headers_complete, - HPE_CB_body, - HPE_CB_message_complete, - HPE_CB_status, - HPE_CB_chunk_header, - HPE_CB_chunk_complete, - HPE_INVALID_EOF_STATE, - HPE_HEADER_OVERFLOW, + HPE_INTERNAL, + HPE_STRICT, + HPE_LF_EXPECTED, + HPE_UNEXPECTED_CONTENT_LENGTH, HPE_CLOSED_CONNECTION, - HPE_INVALID_VERSION, - HPE_INVALID_STATUS, HPE_INVALID_METHOD, HPE_INVALID_URL, - HPE_INVALID_HOST, - HPE_INVALID_PORT, - HPE_INVALID_PATH, - HPE_INVALID_QUERY_STRING, - HPE_INVALID_FRAGMENT, - HPE_LF_EXPECTED, + HPE_INVALID_CONSTANT, + HPE_INVALID_VERSION, HPE_INVALID_HEADER_TOKEN, HPE_INVALID_CONTENT_LENGTH, HPE_INVALID_CHUNK_SIZE, - HPE_INVALID_CONSTANT, - HPE_INVALID_INTERNAL_STATE, - HPE_STRICT, + HPE_INVALID_STATUS, + 
HPE_INVALID_EOF_STATE, + HPE_INVALID_TRANSFER_ENCODING, + HPE_CB_MESSAGE_BEGIN, + HPE_CB_HEADERS_COMPLETE, + HPE_CB_MESSAGE_COMPLETE, + HPE_CB_CHUNK_HEADER, + HPE_CB_CHUNK_COMPLETE, HPE_PAUSED, - HPE_UNKNOWN + HPE_PAUSED_UPGRADE, + HPE_USER + ctypedef llhttp_errno llhttp_errno_t - enum flags: - F_CHUNKED, + enum llhttp_flags: F_CONNECTION_KEEP_ALIVE, F_CONNECTION_CLOSE, F_CONNECTION_UPGRADE, - F_TRAILING, + F_CHUNKED, F_UPGRADE, - F_SKIPBODY - - enum http_method: - DELETE, GET, HEAD, POST, PUT, CONNECT, OPTIONS, TRACE, COPY, - LOCK, MKCOL, MOVE, PROPFIND, PROPPATCH, SEARCH, UNLOCK, BIND, - REBIND, UNBIND, ACL, REPORT, MKACTIVITY, CHECKOUT, MERGE, - MSEARCH, NOTIFY, SUBSCRIBE, UNSUBSCRIBE, PATCH, PURGE, MKCALENDAR, - LINK, UNLINK - - void http_parser_init(http_parser *parser, http_parser_type type) - - size_t http_parser_execute(http_parser *parser, - const http_parser_settings *settings, - const char *data, - size_t len) - - int http_should_keep_alive(const http_parser *parser) - - void http_parser_settings_init(http_parser_settings *settings) - - const char *http_errno_name(http_errno err) - const char *http_errno_description(http_errno err) - const char *http_method_str(http_method m) - - # URL Parser - - enum http_parser_url_fields: - UF_SCHEMA = 0, - UF_HOST = 1, - UF_PORT = 2, - UF_PATH = 3, - UF_QUERY = 4, - UF_FRAGMENT = 5, - UF_USERINFO = 6, - UF_MAX = 7 - - struct http_parser_url_field_data: - uint16_t off - uint16_t len - - struct http_parser_url: - uint16_t field_set - uint16_t port - http_parser_url_field_data[UF_MAX] field_data - - void http_parser_url_init(http_parser_url *u) - - int http_parser_parse_url(const char *buf, - size_t buflen, - int is_connect, - http_parser_url *u) + F_CONTENT_LENGTH, + F_SKIPBODY, + F_TRAILING, + F_LENIENT, + F_TRANSFER_ENCODING + ctypedef llhttp_flags llhttp_flags_t + + enum llhttp_method: + HTTP_DELETE, + HTTP_GET, + HTTP_HEAD, + HTTP_POST, + HTTP_PUT, + HTTP_CONNECT, + HTTP_OPTIONS, + HTTP_TRACE, + HTTP_COPY, + 
HTTP_LOCK, + HTTP_MKCOL, + HTTP_MOVE, + HTTP_PROPFIND, + HTTP_PROPPATCH, + HTTP_SEARCH, + HTTP_UNLOCK, + HTTP_BIND, + HTTP_REBIND, + HTTP_UNBIND, + HTTP_ACL, + HTTP_REPORT, + HTTP_MKACTIVITY, + HTTP_CHECKOUT, + HTTP_MERGE, + HTTP_MSEARCH, + HTTP_NOTIFY, + HTTP_SUBSCRIBE, + HTTP_UNSUBSCRIBE, + HTTP_PATCH, + HTTP_PURGE, + HTTP_MKCALENDAR, + HTTP_LINK, + HTTP_UNLINK, + HTTP_SOURCE, + HTTP_PRI, + HTTP_DESCRIBE, + HTTP_ANNOUNCE, + HTTP_SETUP, + HTTP_PLAY, + HTTP_PAUSE, + HTTP_TEARDOWN, + HTTP_GET_PARAMETER, + HTTP_SET_PARAMETER, + HTTP_REDIRECT, + HTTP_RECORD, + HTTP_FLUSH + ctypedef llhttp_method llhttp_method_t + + void llhttp_init(llhttp_t* parser, llhttp_type_t type, const llhttp_settings_t* settings) + + void llhttp_settings_init(llhttp_settings_t* settings) + + llhttp_errno_t llhttp_execute(llhttp_t* parser, const char* data, size_t len) + + void llhttp_resume_after_upgrade(llhttp_t* parser) + + int llhttp_should_keep_alive(const llhttp_t* parser) + + const char* llhttp_get_error_pos(const llhttp_t* parser) + const char* llhttp_get_error_reason(const llhttp_t* parser) + const char* llhttp_method_name(llhttp_method_t method) + + void llhttp_set_error_reason(llhttp_t* parser, const char* reason); diff --git a/httptools/parser/parser.pyx b/httptools/parser/parser.pyx index 92691c0..6877aa1 100644 --- a/httptools/parser/parser.pyx +++ b/httptools/parser/parser.pyx @@ -19,15 +19,15 @@ cimport cython from . 
cimport cparser -__all__ = ('HttpRequestParser', 'HttpResponseParser', 'parse_url') +__all__ = ('HttpRequestParser', 'HttpResponseParser') @cython.internal cdef class HttpParser: cdef: - cparser.http_parser* _cparser - cparser.http_parser_settings* _csettings + cparser.llhttp_t* _cparser + cparser.llhttp_settings_t* _csettings bytes _current_header_name bytes _current_header_value @@ -42,13 +42,13 @@ cdef class HttpParser: Py_buffer py_buf def __cinit__(self): - self._cparser = \ - PyMem_Malloc(sizeof(cparser.http_parser)) + self._cparser = \ + PyMem_Malloc(sizeof(cparser.llhttp_t)) if self._cparser is NULL: raise MemoryError() - self._csettings = \ - PyMem_Malloc(sizeof(cparser.http_parser_settings)) + self._csettings = \ + PyMem_Malloc(sizeof(cparser.llhttp_settings_t)) if self._csettings is NULL: raise MemoryError() @@ -56,11 +56,11 @@ cdef class HttpParser: PyMem_Free(self._cparser) PyMem_Free(self._csettings) - cdef _init(self, protocol, cparser.http_parser_type mode): - cparser.http_parser_init(self._cparser, mode) - self._cparser.data = self + cdef _init(self, protocol, cparser.llhttp_type_t mode): + cparser.llhttp_settings_init(self._csettings) - cparser.http_parser_settings_init(self._csettings) + cparser.llhttp_init(self._cparser, mode, self._csettings) + self._cparser.data = self self._current_header_name = None self._current_header_value = None @@ -145,59 +145,72 @@ cdef class HttpParser: ### Public API ### def get_http_version(self): - cdef cparser.http_parser* parser = self._cparser + cdef cparser.llhttp_t* parser = self._cparser return '{}.{}'.format(parser.http_major, parser.http_minor) def should_keep_alive(self): - return bool(cparser.http_should_keep_alive(self._cparser)) + return bool(cparser.llhttp_should_keep_alive(self._cparser)) def should_upgrade(self): - cdef cparser.http_parser* parser = self._cparser + cdef cparser.llhttp_t* parser = self._cparser return bool(parser.upgrade) def feed_data(self, data): cdef: size_t data_len - size_t nb + 
cparser.llhttp_errno_t err Py_buffer *buf + bint owning_buf = False + char* err_pos if PyMemoryView_Check(data): buf = PyMemoryView_GET_BUFFER(data) data_len = buf.len - nb = cparser.http_parser_execute( + err = cparser.llhttp_execute( self._cparser, - self._csettings, buf.buf, data_len) else: buf = &self.py_buf PyObject_GetBuffer(data, buf, PyBUF_SIMPLE) + owning_buf = True data_len = buf.len - nb = cparser.http_parser_execute( + err = cparser.llhttp_execute( self._cparser, - self._csettings, buf.buf, data_len) - PyBuffer_Release(buf) - - if self._cparser.http_errno != cparser.HPE_OK: - ex = parser_error_from_errno( - self._cparser.http_errno) + try: + if self._cparser.upgrade == 1 and err == cparser.HPE_PAUSED_UPGRADE: + err_pos = cparser.llhttp_get_error_pos(self._cparser) + + # Immediately free the parser from "error" state, simulating + # http-parser behavior here because 1) we never had the API to + # allow users manually "resume after upgrade", and 2) the use + # case for resuming parsing is very rare. + cparser.llhttp_resume_after_upgrade(self._cparser) + + # The err_pos here is specific for the input buf. So if we ever + # switch to the llhttp behavior (re-raise HttpParserUpgrade for + # successive calls to feed_data() until resume_after_upgrade is + # called), we have to store the result and keep our own state. 
+ raise HttpParserUpgrade(err_pos - buf.buf) + finally: + if owning_buf: + PyBuffer_Release(buf) + + if err != cparser.HPE_OK: + ex = parser_error_from_errno( + self._cparser, + self._cparser.error) if isinstance(ex, HttpParserCallbackError): if self._last_error is not None: ex.__context__ = self._last_error self._last_error = None raise ex - if self._cparser.upgrade: - raise HttpParserUpgrade(nb) - - if nb != data_len: - raise HttpParserError('not all of the data was parsed') - cdef class HttpRequestParser(HttpParser): @@ -209,8 +222,8 @@ cdef class HttpRequestParser(HttpParser): self._csettings.on_url = cb_on_url def get_method(self): - cdef cparser.http_parser* parser = self._cparser - return cparser.http_method_str( parser.method) + cdef cparser.llhttp_t* parser = self._cparser + return cparser.llhttp_method_name( parser.method) cdef class HttpResponseParser(HttpParser): @@ -223,11 +236,11 @@ cdef class HttpResponseParser(HttpParser): self._csettings.on_status = cb_on_status def get_status_code(self): - cdef cparser.http_parser* parser = self._cparser + cdef cparser.llhttp_t* parser = self._cparser return parser.status_code -cdef int cb_on_message_begin(cparser.http_parser* parser) except -1: +cdef int cb_on_message_begin(cparser.llhttp_t* parser) except -1: cdef HttpParser pyparser = parser.data try: pyparser._proto_on_message_begin() @@ -238,55 +251,59 @@ cdef int cb_on_message_begin(cparser.http_parser* parser) except -1: return 0 -cdef int cb_on_url(cparser.http_parser* parser, +cdef int cb_on_url(cparser.llhttp_t* parser, const char *at, size_t length) except -1: cdef HttpParser pyparser = parser.data try: pyparser._proto_on_url(at[:length]) except BaseException as ex: + cparser.llhttp_set_error_reason(parser, "`on_url` callback error") pyparser._last_error = ex - return -1 + return cparser.HPE_USER else: return 0 -cdef int cb_on_status(cparser.http_parser* parser, +cdef int cb_on_status(cparser.llhttp_t* parser, const char *at, size_t length) except -1: 
cdef HttpParser pyparser = parser.data try: pyparser._proto_on_status(at[:length]) except BaseException as ex: + cparser.llhttp_set_error_reason(parser, "`on_status` callback error") pyparser._last_error = ex - return -1 + return cparser.HPE_USER else: return 0 -cdef int cb_on_header_field(cparser.http_parser* parser, +cdef int cb_on_header_field(cparser.llhttp_t* parser, const char *at, size_t length) except -1: cdef HttpParser pyparser = parser.data try: pyparser._on_header_field(at[:length]) except BaseException as ex: + cparser.llhttp_set_error_reason(parser, "`on_header_field` callback error") pyparser._last_error = ex - return -1 + return cparser.HPE_USER else: return 0 -cdef int cb_on_header_value(cparser.http_parser* parser, +cdef int cb_on_header_value(cparser.llhttp_t* parser, const char *at, size_t length) except -1: cdef HttpParser pyparser = parser.data try: pyparser._on_header_value(at[:length]) except BaseException as ex: + cparser.llhttp_set_error_reason(parser, "`on_header_value` callback error") pyparser._last_error = ex - return -1 + return cparser.HPE_USER else: return 0 -cdef int cb_on_headers_complete(cparser.http_parser* parser) except -1: +cdef int cb_on_headers_complete(cparser.llhttp_t* parser) except -1: cdef HttpParser pyparser = parser.data try: pyparser._on_headers_complete() @@ -300,19 +317,20 @@ cdef int cb_on_headers_complete(cparser.http_parser* parser) except -1: return 0 -cdef int cb_on_body(cparser.http_parser* parser, +cdef int cb_on_body(cparser.llhttp_t* parser, const char *at, size_t length) except -1: cdef HttpParser pyparser = parser.data try: pyparser._proto_on_body(at[:length]) except BaseException as ex: + cparser.llhttp_set_error_reason(parser, "`on_body` callback error") pyparser._last_error = ex - return -1 + return cparser.HPE_USER else: return 0 -cdef int cb_on_message_complete(cparser.http_parser* parser) except -1: +cdef int cb_on_message_complete(cparser.llhttp_t* parser) except -1: cdef HttpParser pyparser = 
parser.data try: pyparser._proto_on_message_complete() @@ -323,7 +341,7 @@ cdef int cb_on_message_complete(cparser.http_parser* parser) except -1: return 0 -cdef int cb_on_chunk_header(cparser.http_parser* parser) except -1: +cdef int cb_on_chunk_header(cparser.llhttp_t* parser) except -1: cdef HttpParser pyparser = parser.data try: pyparser._on_chunk_header() @@ -334,7 +352,7 @@ cdef int cb_on_chunk_header(cparser.http_parser* parser) except -1: return 0 -cdef int cb_on_chunk_complete(cparser.http_parser* parser) except -1: +cdef int cb_on_chunk_complete(cparser.llhttp_t* parser) except -1: cdef HttpParser pyparser = parser.data try: pyparser._on_chunk_complete() @@ -345,19 +363,15 @@ cdef int cb_on_chunk_complete(cparser.http_parser* parser) except -1: return 0 -cdef parser_error_from_errno(cparser.http_errno errno): - cdef bytes desc = cparser.http_errno_description(errno) +cdef parser_error_from_errno(cparser.llhttp_t* parser, cparser.llhttp_errno_t errno): + cdef bytes reason = cparser.llhttp_get_error_reason(parser) - if errno in (cparser.HPE_CB_message_begin, - cparser.HPE_CB_url, - cparser.HPE_CB_header_field, - cparser.HPE_CB_header_value, - cparser.HPE_CB_headers_complete, - cparser.HPE_CB_body, - cparser.HPE_CB_message_complete, - cparser.HPE_CB_status, - cparser.HPE_CB_chunk_header, - cparser.HPE_CB_chunk_complete): + if errno in (cparser.HPE_CB_MESSAGE_BEGIN, + cparser.HPE_CB_HEADERS_COMPLETE, + cparser.HPE_CB_MESSAGE_COMPLETE, + cparser.HPE_CB_CHUNK_HEADER, + cparser.HPE_CB_CHUNK_COMPLETE, + cparser.HPE_USER): cls = HttpParserCallbackError elif errno == cparser.HPE_INVALID_STATUS: @@ -372,100 +386,4 @@ cdef parser_error_from_errno(cparser.http_errno errno): else: cls = HttpParserError - return cls(desc.decode('latin-1')) - - -@cython.freelist(250) -cdef class URL: - cdef readonly bytes schema - cdef readonly bytes host - cdef readonly object port - cdef readonly bytes path - cdef readonly bytes query - cdef readonly bytes fragment - cdef readonly 
bytes userinfo - - def __cinit__(self, bytes schema, bytes host, object port, bytes path, - bytes query, bytes fragment, bytes userinfo): - - self.schema = schema - self.host = host - self.port = port - self.path = path - self.query = query - self.fragment = fragment - self.userinfo = userinfo - - def __repr__(self): - return ('' - .format(self.schema, self.host, self.port, self.path, - self.query, self.fragment, self.userinfo)) - - -def parse_url(url): - cdef: - Py_buffer py_buf - char* buf_data - cparser.http_parser_url* parsed - int res - bytes schema = None - bytes host = None - object port = None - bytes path = None - bytes query = None - bytes fragment = None - bytes userinfo = None - object result = None - int off - int ln - - parsed = \ - PyMem_Malloc(sizeof(cparser.http_parser_url)) - cparser.http_parser_url_init(parsed) - - PyObject_GetBuffer(url, &py_buf, PyBUF_SIMPLE) - try: - buf_data = py_buf.buf - res = cparser.http_parser_parse_url(buf_data, py_buf.len, 0, parsed) - - if res == 0: - if parsed.field_set & (1 << cparser.UF_SCHEMA): - off = parsed.field_data[cparser.UF_SCHEMA].off - ln = parsed.field_data[cparser.UF_SCHEMA].len - schema = buf_data[off:off+ln] - - if parsed.field_set & (1 << cparser.UF_HOST): - off = parsed.field_data[cparser.UF_HOST].off - ln = parsed.field_data[cparser.UF_HOST].len - host = buf_data[off:off+ln] - - if parsed.field_set & (1 << cparser.UF_PORT): - port = parsed.port - - if parsed.field_set & (1 << cparser.UF_PATH): - off = parsed.field_data[cparser.UF_PATH].off - ln = parsed.field_data[cparser.UF_PATH].len - path = buf_data[off:off+ln] - - if parsed.field_set & (1 << cparser.UF_QUERY): - off = parsed.field_data[cparser.UF_QUERY].off - ln = parsed.field_data[cparser.UF_QUERY].len - query = buf_data[off:off+ln] - - if parsed.field_set & (1 << cparser.UF_FRAGMENT): - off = parsed.field_data[cparser.UF_FRAGMENT].off - ln = parsed.field_data[cparser.UF_FRAGMENT].len - fragment = buf_data[off:off+ln] - - if parsed.field_set & 
(1 << cparser.UF_USERINFO): - off = parsed.field_data[cparser.UF_USERINFO].off - ln = parsed.field_data[cparser.UF_USERINFO].len - userinfo = buf_data[off:off+ln] - - return URL(schema, host, port, path, query, fragment, userinfo) - else: - raise HttpParserInvalidURLError("invalid url {!r}".format(url)) - finally: - PyBuffer_Release(&py_buf) - PyMem_Free(parsed) + return cls(reason.decode('latin-1')) diff --git a/httptools/parser/url_cparser.pxd b/httptools/parser/url_cparser.pxd new file mode 100644 index 0000000..ab9265a --- /dev/null +++ b/httptools/parser/url_cparser.pxd @@ -0,0 +1,31 @@ +from libc.stdint cimport uint16_t + + +cdef extern from "http_parser.h": + # URL Parser + + enum http_parser_url_fields: + UF_SCHEMA = 0, + UF_HOST = 1, + UF_PORT = 2, + UF_PATH = 3, + UF_QUERY = 4, + UF_FRAGMENT = 5, + UF_USERINFO = 6, + UF_MAX = 7 + + struct http_parser_url_field_data: + uint16_t off + uint16_t len + + struct http_parser_url: + uint16_t field_set + uint16_t port + http_parser_url_field_data[UF_MAX] field_data + + void http_parser_url_init(http_parser_url *u) + + int http_parser_parse_url(const char *buf, + size_t buflen, + int is_connect, + http_parser_url *u) diff --git a/httptools/parser/url_parser.pyx b/httptools/parser/url_parser.pyx new file mode 100644 index 0000000..49908f3 --- /dev/null +++ b/httptools/parser/url_parser.pyx @@ -0,0 +1,108 @@ +#cython: language_level=3 + +from __future__ import print_function +from cpython.mem cimport PyMem_Malloc, PyMem_Free +from cpython cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_SIMPLE, \ + Py_buffer + +from .errors import HttpParserInvalidURLError + +cimport cython +from . 
cimport url_cparser as uparser + +__all__ = ('parse_url',) + +@cython.freelist(250) +cdef class URL: + cdef readonly bytes schema + cdef readonly bytes host + cdef readonly object port + cdef readonly bytes path + cdef readonly bytes query + cdef readonly bytes fragment + cdef readonly bytes userinfo + + def __cinit__(self, bytes schema, bytes host, object port, bytes path, + bytes query, bytes fragment, bytes userinfo): + + self.schema = schema + self.host = host + self.port = port + self.path = path + self.query = query + self.fragment = fragment + self.userinfo = userinfo + + def __repr__(self): + return ('' + .format(self.schema, self.host, self.port, self.path, + self.query, self.fragment, self.userinfo)) + + +def parse_url(url): + cdef: + Py_buffer py_buf + char* buf_data + uparser.http_parser_url* parsed + int res + bytes schema = None + bytes host = None + object port = None + bytes path = None + bytes query = None + bytes fragment = None + bytes userinfo = None + object result = None + int off + int ln + + parsed = \ + PyMem_Malloc(sizeof(uparser.http_parser_url)) + uparser.http_parser_url_init(parsed) + + PyObject_GetBuffer(url, &py_buf, PyBUF_SIMPLE) + try: + buf_data = py_buf.buf + res = uparser.http_parser_parse_url(buf_data, py_buf.len, 0, parsed) + + if res == 0: + if parsed.field_set & (1 << uparser.UF_SCHEMA): + off = parsed.field_data[uparser.UF_SCHEMA].off + ln = parsed.field_data[uparser.UF_SCHEMA].len + schema = buf_data[off:off+ln] + + if parsed.field_set & (1 << uparser.UF_HOST): + off = parsed.field_data[uparser.UF_HOST].off + ln = parsed.field_data[uparser.UF_HOST].len + host = buf_data[off:off+ln] + + if parsed.field_set & (1 << uparser.UF_PORT): + port = parsed.port + + if parsed.field_set & (1 << uparser.UF_PATH): + off = parsed.field_data[uparser.UF_PATH].off + ln = parsed.field_data[uparser.UF_PATH].len + path = buf_data[off:off+ln] + + if parsed.field_set & (1 << uparser.UF_QUERY): + off = parsed.field_data[uparser.UF_QUERY].off + ln 
= parsed.field_data[uparser.UF_QUERY].len + query = buf_data[off:off+ln] + + if parsed.field_set & (1 << uparser.UF_FRAGMENT): + off = parsed.field_data[uparser.UF_FRAGMENT].off + ln = parsed.field_data[uparser.UF_FRAGMENT].len + fragment = buf_data[off:off+ln] + + if parsed.field_set & (1 << uparser.UF_USERINFO): + off = parsed.field_data[uparser.UF_USERINFO].off + ln = parsed.field_data[uparser.UF_USERINFO].len + userinfo = buf_data[off:off+ln] + + return URL(schema, host, port, path, query, fragment, userinfo) + else: + raise HttpParserInvalidURLError("invalid url {!r}".format(url)) + finally: + PyBuffer_Release(&py_buf) + PyMem_Free(parsed) diff --git a/setup.py b/setup.py index ee6bd7d..a1791b2 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,8 @@ class httptools_build_ext(build_ext): 'Produce a colorized HTML version of the Cython source.'), ('cython-directives=', None, 'Cythion compiler directives'), + ('use-system-llhttp', None, + 'Use the system provided llhttp, instead of the bundled one'), ('use-system-http-parser', None, 'Use the system provided http-parser, instead of the bundled one'), ] @@ -33,6 +35,7 @@ class httptools_build_ext(build_ext): boolean_options = build_ext.boolean_options + [ 'cython-always', 'cython-annotate', + 'use-system-llhttp', 'use-system-http-parser', ] @@ -44,6 +47,7 @@ def initialize_options(self): return super().initialize_options() + self.use_system_llhttp = False self.use_system_http_parser = False self.cython_always = False self.cython_annotate = None @@ -108,16 +112,34 @@ def finalize_options(self): self._initialized = True def build_extensions(self): + mod_parser, mod_url_parser = self.distribution.ext_modules + if self.use_system_llhttp: + mod_parser.libraries.append('llhttp') + + if sys.platform == 'darwin' and \ + os.path.exists('/opt/local/include'): + # Support macports on Mac OS X. 
+ mod_parser.include_dirs.append('/opt/local/include') + else: + mod_parser.include_dirs.append( + str(ROOT / 'vendor' / 'llhttp' / 'include')) + mod_parser.include_dirs.append( + str(ROOT / 'vendor' / 'llhttp' / 'src')) + mod_parser.sources.append('vendor/llhttp/src/api.c') + mod_parser.sources.append('vendor/llhttp/src/http.c') + mod_parser.sources.append('vendor/llhttp/src/llhttp.c') + if self.use_system_http_parser: - self.compiler.add_library('http_parser') + mod_url_parser.libraries.append('http_parser') if sys.platform == 'darwin' and \ os.path.exists('/opt/local/include'): # Support macports on Mac OS X. - self.compiler.add_include_dir('/opt/local/include') + mod_url_parser.include_dirs.append('/opt/local/include') else: - self.compiler.add_include_dir(str(ROOT / 'vendor' / 'http-parser')) - self.distribution.ext_modules[0].sources.append( + mod_url_parser.include_dirs.append( + str(ROOT / 'vendor' / 'http-parser')) + mod_url_parser.sources.append( 'vendor/http-parser/http_parser.c') super().build_extensions() @@ -179,6 +201,13 @@ def build_extensions(self): ], extra_compile_args=CFLAGS, ), + Extension( + "httptools.parser.url_parser", + sources=[ + "httptools/parser/url_parser.pyx", + ], + extra_compile_args=CFLAGS, + ), ], include_package_data=True, test_suite='tests.suite', diff --git a/tests/test_parser.py b/tests/test_parser.py index 94a5839..f4136d6 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -103,7 +103,7 @@ def test_parser_response_1(self): with self.assertRaisesRegex( httptools.HttpParserError, - 'data received after completed connection'): + 'Expected HTTP/'): p.feed_data(b'12123123') def test_parser_response_2(self): @@ -117,7 +117,7 @@ def test_parser_response_3(self): for cbname in callbacks: with self.subTest('{} callback fails correctly'.format(cbname)): with self.assertRaisesRegex(httptools.HttpParserCallbackError, - 'callback failed'): + 'callback error'): m = mock.Mock() getattr(m, cbname).side_effect = Exception() @@ 
-225,7 +225,6 @@ def test_parser_request_chunked_1(self): p = httptools.HttpRequestParser(m) p.feed_data(CHUNKED_REQUEST1_1) - self.assertEqual(p.get_method(), b'POST') m.on_message_begin.assert_called_once_with() @@ -348,6 +347,11 @@ def test_parser_request_upgrade_1(self): b'Host': b'example.com', b'Upgrade': b'WebSocket'}) + # The parser can be used again for further parsing - this is a legacy + # behavior from the time we were still using http-parser. + p.feed_data(CHUNKED_REQUEST1_1) + self.assertEqual(p.get_method(), b'POST') + def test_parser_request_upgrade_flag(self): class Protocol: diff --git a/vendor/llhttp b/vendor/llhttp new file mode 160000 index 0000000..3523423 --- /dev/null +++ b/vendor/llhttp @@ -0,0 +1 @@ +Subproject commit 3523423483a61179f47cc7ff0da012fb6f81ec1b