From 64e271dce2e5aca07abdca8e335b3f7cb9ba1114 Mon Sep 17 00:00:00 2001 From: "Felix (xq) Queißner" Date: Sun, 7 Mar 2021 11:58:37 +0100 Subject: Starts to implement new HTML renderer based on gumbo. Everything is a bit borked and only trivial documents work correctly. --- src/browsertab.cpp | 31 +-- src/dialogs/settingsdialog.cpp | 4 +- src/kristall.pro | 10 + src/renderers/geminirenderer.cpp | 9 +- src/renderers/geminirenderer.hpp | 3 +- src/renderers/htmlrenderer.cpp | 447 ++++++++++++++++++++++++++++++++++++ src/renderers/htmlrenderer.hpp | 31 +++ src/renderers/textstyleinstance.cpp | 3 +- 8 files changed, 509 insertions(+), 29 deletions(-) create mode 100644 src/renderers/htmlrenderer.cpp create mode 100644 src/renderers/htmlrenderer.hpp (limited to 'src') diff --git a/src/browsertab.cpp b/src/browsertab.cpp index bc36241..dc85078 100644 --- a/src/browsertab.cpp +++ b/src/browsertab.cpp @@ -6,6 +6,7 @@ #include "renderers/geminirenderer.hpp" #include "renderers/plaintextrenderer.hpp" #include "renderers/markdownrenderer.hpp" +#include "renderers/htmlrenderer.hpp" #include "renderers/renderhelpers.hpp" #include "mimeparser.hpp" @@ -631,7 +632,7 @@ void BrowserTab::renderPage(const QByteArray &data, const MimeType &mime) this->current_location, doc_style, this->outline, - &this->page_title); + this->page_title); } else if (not plaintext_only and mime.is("text","gophermap")) { @@ -642,23 +643,12 @@ void BrowserTab::renderPage(const QByteArray &data, const MimeType &mime) } else if (not plaintext_only and mime.is("text","html")) { - document = std::make_unique(); - - document->setDefaultFont(doc_style.standard_font); - document->setDefaultStyleSheet(doc_style.toStyleSheet()); - renderhelpers::setPageMargins(document.get(), doc_style.margin_h, doc_style.margin_v); - - // Strip inline styles from page, so they don't - // conflict with user styles. 
- QString page_html = QString::fromUtf8(data); - page_html.replace(QRegularExpression("[\\S\\s]*?", QRegularExpression::CaseInsensitiveOption), ""); - - // Strip bgcolor attribute from body. These can screw up user styles too. - page_html.replace(QRegularExpression("", QRegularExpression::CaseInsensitiveOption), ""); - - document->setHtml(page_html); - - page_title = document->metaInformation(QTextDocument::DocumentTitle); + document = HtmlRenderer::render( + data, + this->current_location, + doc_style, + this->outline, + this->page_title); } else if (not plaintext_only and mime.is("text","x-kristall-theme")) { @@ -685,7 +675,8 @@ void BrowserTab::renderPage(const QByteArray &data, const MimeType &mime) src.readAll(), this->current_location, preview_style, - this->outline); + this->outline, + this->page_title); this->ui->text_browser->setStyleSheet(QString("QTextBrowser { background-color: %1; color: %2; }") .arg(preview_style.background_color.name(), preview_style.standard_color.name())); @@ -788,7 +779,7 @@ void BrowserTab::renderPage(const QByteArray &data, const MimeType &mime) this->current_location, doc_style, this->outline, - &this->page_title); + this->page_title); will_cache = false; } diff --git a/src/dialogs/settingsdialog.cpp b/src/dialogs/settingsdialog.cpp index 9a65efd..d71c23e 100644 --- a/src/dialogs/settingsdialog.cpp +++ b/src/dialogs/settingsdialog.cpp @@ -387,12 +387,14 @@ void SettingsDialog::reloadStylePreview() QUrl url { QUrl(QString("about://%1/foobar").arg(host)) }; DocumentOutlineModel outline; + QString document_title; auto doc_style = current_style.derive(url); auto doc = GeminiRenderer::render( document, url, doc_style, - outline + outline, + document_title ); ui->style_preview->setStyleSheet(QString("QTextBrowser { background-color: %1; color: %2; }") diff --git a/src/kristall.pro b/src/kristall.pro index e2b80bf..03698e6 100644 --- a/src/kristall.pro +++ b/src/kristall.pro @@ -83,6 +83,14 @@ external-cmark { 
include($$PWD/../lib/cmark/cmark.pri) } + +external-gumbo-parser { + CONFIG += link_pkgconfig + PKGCONFIG += gumbo-parser +} else { + include($$PWD/../lib/gumbo-parser/gumbo-parser.pri) +} + INCLUDEPATH += $$PWD/../lib/luis-l-gist/ DEPENDPATH += $$PWD/../lib/luis-l-gist/ @@ -103,6 +111,7 @@ SOURCES += \ ioutil.cpp \ main.cpp \ mainwindow.cpp \ + renderers/htmlrenderer.cpp \ renderers/markdownrenderer.cpp \ renderers/renderhelpers.cpp \ renderers/textstyleinstance.cpp \ @@ -151,6 +160,7 @@ HEADERS += \ ioutil.hpp \ kristall.hpp \ mainwindow.hpp \ + renderers/htmlrenderer.hpp \ renderers/markdownrenderer.hpp \ renderers/textstyleinstance.hpp \ widgets/browsertabbar.hpp \ diff --git a/src/renderers/geminirenderer.cpp b/src/renderers/geminirenderer.cpp index 6bcd3a4..3a49706 100644 --- a/src/renderers/geminirenderer.cpp +++ b/src/renderers/geminirenderer.cpp @@ -36,7 +36,7 @@ std::unique_ptr GeminiRenderer::render( QUrl const &root_url, DocumentStyle const & themed_style, DocumentOutlineModel &outline, - QString* const page_title) + QString & page_title) { TextStyleInstance text_style { themed_style }; @@ -72,8 +72,7 @@ std::unique_ptr GeminiRenderer::render( // Set the last line of the preformatted block to have // standard line height. QTextBlockFormat fmt = text_style.preformatted_format; - fmt.setLineHeight(themed_style.line_height_p, - QTextBlockFormat::LineDistanceHeight); + fmt.setLineHeight(themed_style.line_height_p, QTextBlockFormat::LineDistanceHeight); cursor.movePosition(QTextCursor::PreviousBlock); cursor.setBlockFormat(fmt); @@ -190,9 +189,9 @@ std::unique_ptr GeminiRenderer::render( outline.appendH1(heading, id); // Use first heading as the page's title. - if (page_title != nullptr && page_title->isEmpty()) + if (page_title.isEmpty()) { - *page_title = heading; + page_title = heading; } // Centre the first heading. 
We can't use the above code block diff --git a/src/renderers/geminirenderer.hpp b/src/renderers/geminirenderer.hpp index 65fdcf1..33f8136 100644 --- a/src/renderers/geminirenderer.hpp +++ b/src/renderers/geminirenderer.hpp @@ -28,12 +28,13 @@ struct GeminiRenderer //! @param root_url The url that is used to resolve relative links //! @param style The style which is used to render the document //! @param outline The extracted outline from the document + //! @param page_title The extracted page title static std::unique_ptr render( QByteArray const & input, QUrl const & root_url, DocumentStyle const & style, DocumentOutlineModel & outline, - QString* const page_title = nullptr + QString & page_title ); };
diff --git a/src/renderers/htmlrenderer.cpp b/src/renderers/htmlrenderer.cpp
new file mode 100644
index 0000000..e1cd2da
--- /dev/null
+++ b/src/renderers/htmlrenderer.cpp
@@ -0,0 +1,447 @@
+#include "htmlrenderer.hpp"
+
+#include "renderhelpers.hpp"
+#include "textstyleinstance.hpp"
+#include "gumbo.h"
+
+#include
+#include
+#include
+
+// Allocator callbacks in the shape GumboOptions expects.
+// The first argument is the user pointer from the options and is not used.
+static void* malloc_wrapper(void*, size_t size) { return malloc(size); }
+
+static void free_wrapper(void*, void* ptr) { free(ptr); }
+
+// Parser options shared by parsing and by destroyGumboOutput().
+static GumboOptions const gumbo_options = {
+    &malloc_wrapper, &free_wrapper, // memory management
+    nullptr, // user pointer
+    4, // tab width
+    false, // stop on first error
+    -1, // maximum numbers of errors (-1 = infinite)
+    GUMBO_TAG_LAST, // fragment context
+    GUMBO_NAMESPACE_HTML // fragment namespace
+};
+
+//! Deleter for parser output; releases it with the same options it was parsed with.
+static void destroyGumboOutput(GumboOutput * output)
+{
+    gumbo_destroy_output(&gumbo_options, output);
+}
+
+//! Returns the first direct child of @p parent that is an element with tag @p tag,
+//! or nullptr when there is no such child.
+//! @p parent must be an element node.
+static GumboNode const * find_child_element(GumboNode const & parent, GumboTag tag)
+{
+    GumboVector const & children = parent.v.element.children;
+    for (size_t i = 0; i < children.length; ++i) {
+        auto const * child = static_cast<GumboNode const *>(children.data[i]);
+        if (child->type == GUMBO_NODE_ELEMENT and child->v.element.tag == tag)
+            return child;
+    }
+    return nullptr;
+}
+
+//! Looks up the text of the <title> element below the <head> child of @p root.
+//! @param root The root element of the parsed document (must be an element node)
+//! @return The title text, "" when <title> does not have exactly one child,
+//!         or nullptr when the root has fewer than two children, when there is
+//!         no <head> or <title>, or when the single child is not a text node.
+//!         The returned pointer refers to the text stored in the node itself
+//!         (presumably only valid while the parser output is alive - confirm).
+static const char* find_title(const GumboNode* root) {
+    assert(root->type == GUMBO_NODE_ELEMENT);
+    if(root->v.element.children.length < 2)
+        return nullptr;
+
+    GumboNode const * const head = find_child_element(*root, GUMBO_TAG_HEAD);
+    if(head == nullptr)
+        return nullptr;
+
+    GumboNode const * const title = find_child_element(*head, GUMBO_TAG_TITLE);
+    if(title == nullptr)
+        return nullptr;
+
+    if (title->v.element.children.length != 1) {
+        return "";
+    }
+
+    auto const * title_text = static_cast<GumboNode const *>(title->v.element.children.data[0]);
+    if(title_text->type == GUMBO_NODE_TEXT or title_text->type == GUMBO_NODE_WHITESPACE)
+        return title_text->v.text.text;
+    return nullptr;
+}
+
+//! State shared by all recursion levels while a document is rendered.
+struct RenderState
+{
+    QTextCursor cursor;             //!< insertion point in the document being built
+    TextStyleInstance text_style;   //!< text/block formats used for the elements
+    QUrl root_url;
+    DocumentStyle const * style;
+    DocumentOutlineModel * outline;
+};
+
+//! Returns the value of the attribute called @p attrib_name on @p element,
+//! or nullptr when the element has no such attribute.
+//! The name comparison uses strcmp, so it is case sensitive.
+static char const * getAttribute(GumboElement const & element, char const * attrib_name)
+{
+    for(size_t i = 0; i < element.attributes.length; i++)
+    {
+        // NOTE(review): the cast target was lost in extraction; GumboAttribute is
+        // what gumbo stores in GumboElement::attributes - confirm against upstream.
+        auto const attrib = static_cast<GumboAttribute const *>(element.attributes.data[i]);
+        if(strcmp(attrib->name, attrib_name) == 0)
+            return attrib->value;
+    }
+    return nullptr;
+}
+
+//! Scope guard for cursor formats: remembers the cursor's char and block format
+//! on construction and applies both to the cursor again on destruction.
+struct TextFormatReset
+{
+    QTextCursor * cursor;
+
+    QTextCharFormat char_format;
+    QTextBlockFormat block_format;
+
+    explicit TextFormatReset(QTextCursor * cursor) :
+        cursor(cursor),
+        char_format(cursor->charFormat()),
+        block_format(cursor->blockFormat())
+    {
+
+    }
+
+    // A guard is tied to one scope; copying, moving or assigning it makes no sense.
+    TextFormatReset(TextFormatReset const &) = delete;
+    TextFormatReset(TextFormatReset &&) = delete;
+    TextFormatReset & operator=(TextFormatReset const &) = delete;
+    TextFormatReset & operator=(TextFormatReset &&) = delete;
+
+    ~TextFormatReset()
+    {
+        this->cursor->setCharFormat(this->char_format);
+        this->cursor->setBlockFormat(this->block_format);
+    }
+
+};
+
+// Problems:
+// Style/theme elements must use a push/pop
+// use instead of "replacing" styles
+// Otherwise,

<h1><a>Foo</a></h1>

will be rendered as a link, not as a heading.
+// Should be combined here.
+
+//! Renders @p node and, recursively, its children at the current cursor position.
+//!
+//! For element nodes the cursor's char/block format that was active on entry is
+//! restored once the element has been handled (via TextFormatReset).
+//! <style>, <script> and <nav> are dropped completely; <img>, <svg>, <button>
+//! and <input> are replaced by a textual placeholder and their children skipped.
+//! @param state   Cursor, formats and document context shared by the whole walk
+//! @param node    The node to render
+//! @param nesting Recursion depth; currently only passed on to the child calls
+static void renderRecursive(RenderState & state, GumboNode const & node, int nesting = 0)
+{
+    auto & cursor = state.cursor;
+    auto & text_style = state.text_style;
+    switch(node.type)
+    {
+    /** Document node. v will be a GumboDocument. */
+    case GUMBO_NODE_DOCUMENT: {
+        qWarning() << "Detected embedded document";
+
+        // A document node keeps its data in v.document, not in v.element, so it
+        // must not fall through into the element case. Render its children only.
+        GumboVector const & doc_children = node.v.document.children;
+        for (size_t i = 0; i < doc_children.length; ++i) {
+            auto const * child = static_cast<GumboNode const *>(doc_children.data[i]);
+            renderRecursive(state, *child, nesting + 1);
+        }
+        break;
+    }
+
+    /** Element node. v will be a GumboElement. */
+    case GUMBO_NODE_ELEMENT: {
+        auto const & element = node.v.element;
+
+        // Restores the formats on every exit path of this case, including the
+        // early returns below.
+        TextFormatReset format_reset { &cursor };
+
+        // qDebug() << "begin node(" << gumbo_normalized_tagname(element.tag) << ")";
+
+        // Opening part: set up formats / containers before the children are rendered.
+        switch(element.tag) {
+
+        // Stripped tags
+        case GUMBO_TAG_STYLE:
+        case GUMBO_TAG_SCRIPT:
+            return;
+
+        case GUMBO_TAG_NAV: {
+            // TODO: Optionally strip navigation from sites.
+            // Until that is configurable, navigation is always stripped.
+            return;
+        }
+
+        // Terminal tags
+        case GUMBO_TAG_IMG: {
+            // TODO: Insert link to image here
+            cursor.insertText("[IMG]");
+            return;
+        }
+        case GUMBO_TAG_SVG: {
+            // TODO: Insert link to image here
+            cursor.insertText("[SVG]");
+            return;
+        }
+        case GUMBO_TAG_BUTTON: {
+            // TODO: Decide how form controls should be represented
+            cursor.insertText("[BUTTON]");
+            return;
+        }
+        case GUMBO_TAG_INPUT: {
+            // TODO: Decide how form controls should be represented
+            cursor.insertText("[INPUT]");
+            return;
+        }
+
+        // Paragraph-like elements:
+        case GUMBO_TAG_DIV: // <div> is the same as <p> for us
+        case GUMBO_TAG_P: {
+            // cursor.insertBlock();
+            break;
+        }
+        case GUMBO_TAG_H1: {
+            // cursor.insertBlock();
+            cursor.setBlockFormat(text_style.heading_format);
+            cursor.setCharFormat(text_style.standard_h1);
+            break;
+        }
+        case GUMBO_TAG_H2: {
+            // cursor.insertBlock();
+            cursor.setBlockFormat(text_style.heading_format);
+            cursor.setCharFormat(text_style.standard_h2);
+
+            break;
+        }
+        case GUMBO_TAG_H3: {
+            // cursor.insertBlock();
+            cursor.setBlockFormat(text_style.heading_format);
+            cursor.setCharFormat(text_style.standard_h3);
+            break;
+        }
+
+        case GUMBO_TAG_PRE: {
+            // cursor.insertBlock();
+            cursor.setBlockFormat(text_style.preformatted_format);
+            cursor.setCharFormat(text_style.preformatted);
+            break;
+        }
+
+        case GUMBO_TAG_OL:
+        case GUMBO_TAG_UL: {
+            // cursor.insertBlock();
+
+            if(element.tag == GUMBO_TAG_OL) {
+                // Ordered lists reuse the list format, but numbered as "1.", "2.", ...
+                auto fmt = text_style.list_format;
+                fmt.setStyle(QTextListFormat::ListDecimal);
+                fmt.setNumberPrefix("");
+                fmt.setNumberSuffix(".");
+                cursor.createList(fmt);
+            }
+            else {
+                cursor.createList(text_style.list_format);
+            }
+            break;
+        }
+        case GUMBO_TAG_LI: {
+            break;
+        }
+
+
+        // Text modification elements:
+        case GUMBO_TAG_SPAN: {
+            // This usually has a style change, but we ignore that completely
+            break;
+        }
+        case GUMBO_TAG_BR: {
+            cursor.insertText("\n");
+            break;
+        }
+        case GUMBO_TAG_I: {
+            auto fmt = cursor.charFormat();
+            fmt.setFontItalic(true);
+            cursor.setCharFormat(fmt);
+            break;
+        }
+        case GUMBO_TAG_B: {
+            auto fmt = cursor.charFormat();
+            fmt.setFontWeight(QFont::Bold);
+            cursor.setCharFormat(fmt);
+            break;
+        }
+        case GUMBO_TAG_U: {
+            auto fmt = cursor.charFormat();
+            fmt.setFontUnderline(true);
+            cursor.setCharFormat(fmt);
+            break;
+        }
+        case GUMBO_TAG_A: {
+            // Anchors without a href still become links, pointing to "#".
+            char const * anchor = getAttribute(element, "href");
+            if(anchor == nullptr) {
+                anchor = "#";
+            }
+
+            // NOTE(review): the href is stored as written; state.root_url is not
+            // applied here - confirm whether relative links get resolved elsewhere.
+            auto fmt = text_style.standard_link;
+            fmt.setAnchor(true);
+            fmt.setAnchorHref(QString::fromUtf8(anchor));
+            cursor.setBlockFormat(text_style.link_format);
+            cursor.setCharFormat(fmt);
+            break;
+        }
+        case GUMBO_TAG_BLOCKQUOTE: {
+            // Block quotes are rendered inside a 1x1 table.
+            QTextTable *table = cursor.insertTable(1, 1,text_style.blockquote_tableformat);
+            cursor.setBlockFormat(text_style.blockquote_format);
+            QTextTableCell cell = table->cellAt(0, 0);
+            cell.setFormat(text_style.blockquote);
+
+            cursor.setCharFormat(text_style.blockquote);
+
+            break;
+        }
+        default:
+            qDebug() << "unhandled tag:" << gumbo_normalized_tagname(element.tag);
+            break;
+        }
+
+        for (size_t i = 0; i < element.children.length; ++i) {
+            auto const * child = static_cast<GumboNode const *>(element.children.data[i]);
+            renderRecursive(state, *child, nesting + 1);
+        }
+
+        // Closing part: terminate whatever the opening part started.
+        switch(element.tag) {
+        // case GUMBO_TAG_PRE: {
+        //     // Set the last line of the preformatted block to have
+        //     // standard line height.
+        //     QTextBlockFormat fmt = cursor.blockFormat();
+        //     fmt.setLineHeight(state.style->line_height_p, QTextBlockFormat::LineDistanceHeight);
+        //     cursor.movePosition(QTextCursor::PreviousBlock);
+        //     cursor.setBlockFormat(fmt);
+        //
+        //     cursor.movePosition(QTextCursor::NextBlock);
+        //     break;
+        // }
+
+        // Requires closing block
+        case GUMBO_TAG_PRE:
+        case GUMBO_TAG_P:
+        case GUMBO_TAG_DIV:
+        case GUMBO_TAG_H1:
+        case GUMBO_TAG_H2:
+        case GUMBO_TAG_H3:
+            cursor.insertBlock();
+            break;
+
+        case GUMBO_TAG_OL:
+        case GUMBO_TAG_UL:
+            // cursor.insertBlock();
+            break;
+
+        case GUMBO_TAG_LI:
+            // Terminate the <li> by pressing "enter"
+            cursor.insertBlock();
+            break;
+
+        case GUMBO_TAG_BLOCKQUOTE:
+            cursor.deletePreviousChar();
+            cursor.movePosition(QTextCursor::NextBlock);
+            break;
+
+        default: break;
+        }
+
+        // qDebug() << "end node(" << gumbo_normalized_tagname(element.tag) << ")";
+
+        break;
+    }
+
+    /** Text node. v will be a GumboText. */
+    case GUMBO_NODE_TEXT: {
+        auto const & text = node.v.text;
+
+        auto contents = QString::fromUtf8(text.text);
+        // qDebug() << contents;
+
+        // Built once instead of once per text node; the pattern never changes.
+        static QRegularExpression const whitespace_run { "\\s+", QRegularExpression::DotMatchesEverythingOption };
+
+        // TODO: This is not quite right, but QTextCursor::insertText
+        //       will insert spurious blocks when a "\n" is encountered.
+        state.cursor.insertText(contents.replace(whitespace_run, " "));
+        break;
+    }
+
+    /** CDATA node. v will be a GumboText. */
+    case GUMBO_NODE_CDATA: {
+        auto const & text = node.v.text;
+
+        auto const contents = QString::fromUtf8(text.text);
+
+        // TODO: This is not quite right, but QTextCursor::insertText
+        //       will insert spurious blocks when a "\n" is encountered.
+        state.cursor.insertText(contents.trimmed());
+        break;
+    }
+
+    /** Comment node. v will be a GumboText, excluding comment delimiters. */
+    case GUMBO_NODE_COMMENT: {
+        // qDebug() << "comment(" << ")";
+        break;
+    }
+
+    /** Text node, where all contents is whitespace. v will be a GumboText. */
+    case GUMBO_NODE_WHITESPACE: {
+        // qDebug() << "whitespace(" << ")";
+        break;
+    }
+
+    /** Template node. This is separate from GUMBO_NODE_ELEMENT because many
+     * client libraries will want to ignore the contents of template nodes, as
+     * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
+     * here, while clients that want to include template contents should also
+     * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
+    case GUMBO_NODE_TEMPLATE: {
+        qDebug() << "template(" << "???" << ")";
+        break;
+    }
+    }
+}
+
+// Parses `input` as HTML and renders the contents of its <body> into a new document.
+// `page_title` is only written when a title is found; nullptr is returned when the
+// input yields no usable element root.
+// NOTE(review): the template arguments in this function (QTextDocument, GumboOutput)
+// were lost in extraction and are restored from how the values are used - confirm.
+std::unique_ptr<QTextDocument> HtmlRenderer::render(
+    QByteArray const &input,
+    QUrl const & root_url,
+    DocumentStyle const & style,
+    DocumentOutlineModel & outline,
+    QString & page_title)
+{
+    std::unique_ptr<GumboOutput, decltype(&destroyGumboOutput)> gumbo_output {
+        gumbo_parse_with_options(&gumbo_options, input.data(), static_cast<size_t>(input.length())),
+        &destroyGumboOutput,
+    };
+
+    // Guard against a failed parse before anything below dereferences the output.
+    if(gumbo_output == nullptr or gumbo_output->root == nullptr) {
+        qWarning() << "html document could not be parsed!";
+        return nullptr;
+    }
+
+    if(gumbo_output->errors.length > 0) {
+        qDebug() << "Parsing the html document yielded" << gumbo_output->errors.length << "errors!";
+    }
+
+    if(gumbo_output->root->type != GUMBO_NODE_ELEMENT) {
+        qWarning() << "html document has no proper root node!";
+        return nullptr;
+    }
+
+    auto doc = std::make_unique<QTextDocument>();
+    renderhelpers::setPageMargins(doc.get(), style.margin_h, style.margin_v);
+    doc->setIndentWidth(style.indent_size);
+
+    outline.beginBuild();
+
+    // Find page title
+    {
+        const char* title = find_title(gumbo_output->root);
+        if(title != nullptr) {
+            page_title = QString::fromUtf8(title);
+        }
+    }
+
+    // Render the first <body> child of the root, if there is one.
+    {
+        GumboVector const * const root_children = &gumbo_output->root->v.element.children;
+        GumboNode const * body = nullptr;
+        for (size_t i = 0; i < root_children->length; ++i) {
+            auto const * child = static_cast<GumboNode const *>(root_children->data[i]);
+            if (child->type == GUMBO_NODE_ELEMENT and child->v.element.tag == GUMBO_TAG_BODY) {
+                body = child;
+                break;
+            }
+        }
+        if(body != nullptr)
+        {
+            RenderState state {
+                QTextCursor { doc.get() },
+                TextStyleInstance { style },
+                root_url,
+                &style,
+                &outline,
+            };
+
+            state.cursor.setBlockFormat(state.text_style.standard_format);
+            state.cursor.setCharFormat(state.text_style.standard);
+
+            renderRecursive(state, *body);
+        }
+    }
+
+
+    outline.endBuild();
+
+    return doc;
+}
diff --git a/src/renderers/htmlrenderer.hpp b/src/renderers/htmlrenderer.hpp
new file mode 100644
index 0000000..d835236
--- /dev/null
+++ b/src/renderers/htmlrenderer.hpp
@@ -0,0 +1,31 @@
+#ifndef HTMLRENDERER_HPP
+#define HTMLRENDERER_HPP
+
+#include
+#include
+#include
+#include
+
+#include "documentoutlinemodel.hpp"
+
+#include "documentstyle.hpp"
+
+//! Static-only entry point of the HTML renderer; cannot be instantiated.
+struct HtmlRenderer
+{
+    HtmlRenderer() = delete;
+
+    //! Renders the given HTML byte sequence into a text document.
+    //! @param input The utf8 encoded input string
+    //! @param root_url The url that is used to resolve relative links
+    //! @param style The style which is used to render the document
+    //! @param outline The extracted outline from the document
+    //! @param page_title Receives the text of the document's title element when
+    //!                   one is found; left untouched otherwise
+    //! @return The rendered document, or nullptr when the input has no proper
+    //!         root node
+    // NOTE(review): the return type's template argument was lost in extraction;
+    // QTextDocument is inferred from how callers use the result - confirm.
+    static std::unique_ptr<QTextDocument> render(
+        QByteArray const & input,
+        QUrl const & root_url,
+        DocumentStyle const & style,
+        DocumentOutlineModel & outline,
+        QString & page_title
+    );
+};
+
+#endif // HTMLRENDERER_HPP
diff --git a/src/renderers/textstyleinstance.cpp b/src/renderers/textstyleinstance.cpp index 044953c..7a7822e 100644 --- a/src/renderers/textstyleinstance.cpp +++ b/src/renderers/textstyleinstance.cpp @@ -60,7 +60,6 @@ TextStyleInstance::TextStyleInstance(DocumentStyle const & themed_style) preformatted_format.setIndent(themed_style.indent_p); - heading_format.setLineHeight(themed_style.line_height_h, - QTextBlockFormat::LineDistanceHeight); + heading_format.setLineHeight(themed_style.line_height_h, QTextBlockFormat::LineDistanceHeight); heading_format.setIndent(themed_style.indent_h); } -- cgit v1.2.3