diff options
| author | Felix (xq) Queißner <git@mq32.de> | 2021-03-07 11:58:37 +0100 |
|---|---|---|
| committer | Felix Queißner <felix@ib-queissner.de> | 2021-03-08 02:23:16 +0100 |
| commit | 64e271dce2e5aca07abdca8e335b3f7cb9ba1114 (patch) | |
| tree | e3aed2f5057f4db7eccc9505125c3a539ad543c1 /src/renderers | |
| parent | 2e4bd9e8a1f0ee14fb58c91ae2d94bcb96c6cbce (diff) | |
| download | kristall-64e271dce2e5aca07abdca8e335b3f7cb9ba1114.tar.gz | |
Starts to implement a new HTML renderer based on gumbo. Everything is still a bit broken, and only trivial documents render correctly.
Diffstat (limited to 'src/renderers')
| -rw-r--r-- | src/renderers/geminirenderer.cpp | 9 | ||||
| -rw-r--r-- | src/renderers/geminirenderer.hpp | 3 | ||||
| -rw-r--r-- | src/renderers/htmlrenderer.cpp | 447 | ||||
| -rw-r--r-- | src/renderers/htmlrenderer.hpp | 31 | ||||
| -rw-r--r-- | src/renderers/textstyleinstance.cpp | 3 |
5 files changed, 485 insertions, 8 deletions
diff --git a/src/renderers/geminirenderer.cpp b/src/renderers/geminirenderer.cpp index 6bcd3a4..3a49706 100644 --- a/src/renderers/geminirenderer.cpp +++ b/src/renderers/geminirenderer.cpp @@ -36,7 +36,7 @@ std::unique_ptr<GeminiDocument> GeminiRenderer::render( QUrl const &root_url, DocumentStyle const & themed_style, DocumentOutlineModel &outline, - QString* const page_title) + QString & page_title) { TextStyleInstance text_style { themed_style }; @@ -72,8 +72,7 @@ std::unique_ptr<GeminiDocument> GeminiRenderer::render( // Set the last line of the preformatted block to have // standard line height. QTextBlockFormat fmt = text_style.preformatted_format; - fmt.setLineHeight(themed_style.line_height_p, - QTextBlockFormat::LineDistanceHeight); + fmt.setLineHeight(themed_style.line_height_p, QTextBlockFormat::LineDistanceHeight); cursor.movePosition(QTextCursor::PreviousBlock); cursor.setBlockFormat(fmt); @@ -190,9 +189,9 @@ std::unique_ptr<GeminiDocument> GeminiRenderer::render( outline.appendH1(heading, id); // Use first heading as the page's title. - if (page_title != nullptr && page_title->isEmpty()) + if (page_title.isEmpty()) { - *page_title = heading; + page_title = heading; } // Centre the first heading. We can't use the above code block diff --git a/src/renderers/geminirenderer.hpp b/src/renderers/geminirenderer.hpp index 65fdcf1..33f8136 100644 --- a/src/renderers/geminirenderer.hpp +++ b/src/renderers/geminirenderer.hpp @@ -28,12 +28,13 @@ struct GeminiRenderer //! @param root_url The url that is used to resolve relative links //! @param style The style which is used to render the document //! @param outline The extracted outline from the document + //! 
@param page_title The extracted page title static std::unique_ptr<GeminiDocument> render( QByteArray const & input, QUrl const & root_url, DocumentStyle const & style, DocumentOutlineModel & outline, - QString* const page_title = nullptr + QString & page_title ); }; diff --git a/src/renderers/htmlrenderer.cpp b/src/renderers/htmlrenderer.cpp new file mode 100644 index 0000000..e1cd2da --- /dev/null +++ b/src/renderers/htmlrenderer.cpp @@ -0,0 +1,447 @@ +#include "htmlrenderer.hpp" + +#include "renderhelpers.hpp" +#include "textstyleinstance.hpp" +#include "gumbo.h" + +#include <QDebug> +#include <QTextTable> +#include <QRegularExpression> + +static void* malloc_wrapper(void*, size_t size) { return malloc(size); } + +static void free_wrapper(void*, void* ptr) { free(ptr); } + +static GumboOptions const gumbo_options = { + &malloc_wrapper, &free_wrapper, // memory management + nullptr, // user pointer + 4, // tab width + false, // stop on first error + -1, // maximum numbers of errors (-1 = infinite) + GUMBO_TAG_LAST, + GUMBO_NAMESPACE_HTML +}; + +static void destroyGumboOutput(GumboOutput * output) +{ + gumbo_destroy_output(&gumbo_options, output); +} + +static const char* find_title(const GumboNode* root) { + assert(root->type == GUMBO_NODE_ELEMENT); + if(root->v.element.children.length < 2) + return nullptr; + + const GumboVector* root_children = &root->v.element.children; + GumboNode* head = nullptr; + for (size_t i = 0; i < root_children->length; ++i) { + GumboNode* child = (GumboNode*)root_children->data[i]; + if (child->type == GUMBO_NODE_ELEMENT and child->v.element.tag == GUMBO_TAG_HEAD) { + head = child; + break; + } + } + if(head == nullptr) + return nullptr; + + GumboVector* head_children = &head->v.element.children; + for (size_t i = 0; i < head_children->length; ++i) { + GumboNode* child = (GumboNode*)head_children->data[i]; + if (child->type == GUMBO_NODE_ELEMENT and child->v.element.tag == GUMBO_TAG_TITLE) { + if (child->v.element.children.length != 
1) { + return ""; + } + GumboNode* title_text = (GumboNode*)child->v.element.children.data[0]; + if(title_text->type == GUMBO_NODE_TEXT or title_text->type == GUMBO_NODE_WHITESPACE) + return title_text->v.text.text; + return nullptr; + } + } + return nullptr; +} + +struct RenderState +{ + QTextCursor cursor; + TextStyleInstance text_style; + QUrl root_url; + DocumentStyle const * style; + DocumentOutlineModel * outline; +}; + +static char const * getAttribute(GumboElement const & element, char const * attrib_name) +{ + for(size_t i = 0; i < element.attributes.length; i++) + { + auto const attrib = static_cast<GumboAttribute const *>(element.attributes.data[i]); + if(strcmp(attrib->name, attrib_name) == 0) + return attrib->value; + } + return nullptr; +} + +struct TextFormatReset +{ + QTextCursor * cursor; + + QTextCharFormat char_format; + QTextBlockFormat block_format; + + TextFormatReset(QTextCursor * cursor) : + cursor(cursor), + char_format(cursor->charFormat()), + block_format(cursor->blockFormat()) + { + + } + + TextFormatReset(TextFormatReset const &) = delete; + TextFormatReset(TextFormatReset &&) = delete; + + ~TextFormatReset() + { + this->cursor->setCharFormat(this->char_format); + this->cursor->setBlockFormat(this->block_format); + } + +}; + +// Problems: +// Style/theme elements must use a push/pop +// use instead of "replacing" styles +// Otherwise, <h1><a>Foo</a></h1> will be rendered as a link, not as a heading. +// Should be combined here. + +static void renderRecursive(RenderState & state, GumboNode const & node, int nesting = 0) +{ + auto & cursor = state.cursor; + auto & text_style = state.text_style; + switch(node.type) + { + /** Document node. v will be a GumboDocument. */ + case GUMBO_NODE_DOCUMENT: { + qWarning() << "Detected embedded document"; + } + + /** Element node. v will be a GumboElement. 
*/ + case GUMBO_NODE_ELEMENT: { + auto const & element = node.v.element; + + TextFormatReset format_reset { &cursor }; + + // qDebug() << "begin node(" << gumbo_normalized_tagname(element.tag) << ")"; + + switch(element.tag) { + + // Stripped tags + case GUMBO_TAG_STYLE: + case GUMBO_TAG_SCRIPT: + return; + + case GUMBO_TAG_NAV: { + // TODO: Optionally strip navigation from sites + if(true) + return; + break; + } + + // Terminal tags + case GUMBO_TAG_IMG: { + // TODO: Insert link to image here + cursor.insertText("[IMG]"); + return; + } + case GUMBO_TAG_SVG: { + // TODO: Insert link to image here + cursor.insertText("[SVG]"); + return; + } + case GUMBO_TAG_BUTTON: { + // TODO: Insert link to image here + cursor.insertText("[BUTTON]"); + return; + } + case GUMBO_TAG_INPUT: { + // TODO: Insert link to image here + cursor.insertText("[INPUT]"); + return; + } + + // Paragraph-like elements: + case GUMBO_TAG_DIV: // <div> is the same as <p> for us + case GUMBO_TAG_P: { + // cursor.insertBlock(); + break; + } + case GUMBO_TAG_H1: { + // cursor.insertBlock(); + cursor.setBlockFormat(text_style.heading_format); + cursor.setCharFormat(text_style.standard_h1); + break; + } + case GUMBO_TAG_H2: { + // cursor.insertBlock(); + cursor.setBlockFormat(text_style.heading_format); + cursor.setCharFormat(text_style.standard_h2); + + break; + } + case GUMBO_TAG_H3: { + // cursor.insertBlock(); + cursor.setBlockFormat(text_style.heading_format); + cursor.setCharFormat(text_style.standard_h3); + break; + } + + case GUMBO_TAG_PRE: { + // cursor.insertBlock(); + cursor.setBlockFormat(text_style.preformatted_format); + cursor.setCharFormat(text_style.preformatted); + break; + } + + case GUMBO_TAG_OL: + case GUMBO_TAG_UL: { + // cursor.insertBlock(); + + if(element.tag == GUMBO_TAG_OL) { + auto fmt = text_style.list_format; + fmt.setStyle(QTextListFormat::ListDecimal); + fmt.setNumberPrefix(""); + fmt.setNumberSuffix("."); + cursor.createList(fmt); + } + else { + 
cursor.createList(text_style.list_format); + } + break; + } + case GUMBO_TAG_LI: { + break; + } + + + // Text modification elements: + case GUMBO_TAG_SPAN: { + // This usually has a style change, but we ignore that completly + break; + } + case GUMBO_TAG_BR: { + cursor.insertText("\n"); + break; + } + case GUMBO_TAG_I: { + auto fmt = cursor.charFormat(); + fmt.setFontItalic(true); + cursor.setCharFormat(fmt); + break; + } + case GUMBO_TAG_B: { + auto fmt = cursor.charFormat(); + fmt.setFontWeight(QFont::Bold); + cursor.setCharFormat(fmt); + break; + } + case GUMBO_TAG_U: { + auto fmt = cursor.charFormat(); + fmt.setFontUnderline(true); + cursor.setCharFormat(fmt); + break; + } + case GUMBO_TAG_A: { + char const * anchor = getAttribute(element, "href"); + if(anchor == nullptr) { + anchor = "#"; + } + + auto fmt = text_style.standard_link; + fmt.setAnchor(true); + fmt.setAnchorHref(QString::fromUtf8(anchor)); + cursor.setBlockFormat(text_style.link_format); + cursor.setCharFormat(fmt); + break; + } + case GUMBO_TAG_BLOCKQUOTE: { + QTextTable *table = cursor.insertTable(1, 1,text_style.blockquote_tableformat); + cursor.setBlockFormat(text_style.blockquote_format); + QTextTableCell cell = table->cellAt(0, 0); + cell.setFormat(text_style.blockquote); + + cursor.setCharFormat(text_style.blockquote); + + break; + } + default: + qDebug() << "unhandled tag:" << gumbo_normalized_tagname(element.tag); + break; + } + + for (size_t i = 0; i < element.children.length; ++i) { + GumboNode* child = (GumboNode*)element.children.data[i]; + renderRecursive(state, *child, nesting + 1); + } + + switch(element.tag) { + // case GUMBO_TAG_PRE: { +// // Set the last line of the preformatted block to have +// // standard line height. 
+// QTextBlockFormat fmt = cursor.blockFormat(); +// fmt.setLineHeight(state.style->line_height_p, QTextBlockFormat::LineDistanceHeight); +// cursor.movePosition(QTextCursor::PreviousBlock); +// cursor.setBlockFormat(fmt); + +// cursor.movePosition(QTextCursor::NextBlock); +// break; +// } + + // Requires closing block + case GUMBO_TAG_PRE: + case GUMBO_TAG_P: + case GUMBO_TAG_DIV: + case GUMBO_TAG_H1: + case GUMBO_TAG_H2: + case GUMBO_TAG_H3: + cursor.insertBlock(); + break; + + case GUMBO_TAG_OL: + case GUMBO_TAG_UL: + // cursor.insertBlock(); + break; + + case GUMBO_TAG_LI: + // Terminate the <li> by pressing "enter" + cursor.insertBlock(); + break; + + case GUMBO_TAG_BLOCKQUOTE: + cursor.deletePreviousChar(); + cursor.movePosition(QTextCursor::NextBlock); + break; + + default: break; + } + + // qDebug() << "end node(" << gumbo_normalized_tagname(element.tag) << ")"; + + break; + } + + /** Text node. v will be a GumboText. */ + case GUMBO_NODE_TEXT: { + auto const & text = node.v.text; + + auto contents = QString::fromUtf8(text.text); + // qDebug() << contents; + + QRegularExpression regex { "\\s+", QRegularExpression::DotMatchesEverythingOption }; + + // TODO: This is not quite right, but QTextCursor::inserText + // will insert spurious blocks when a "\n" is encountered. + state.cursor.insertText(contents.replace(regex, " ")); + break; + } + + /** CDATA node. v will be a GumboText. */ + case GUMBO_NODE_CDATA: { + auto const & text = node.v.text; + + auto const contents = QString::fromUtf8(text.text); + + // TODO: This is not quite right, but QTextCursor::inserText + // will insert spurious blocks when a "\n" is encountered. + state.cursor.insertText(contents.trimmed()); + break; + } + + /** Comment node. v will be a GumboText, excluding comment delimiters. */ + case GUMBO_NODE_COMMENT: { + // qDebug() << "comment(" << ")"; + break; + } + + /** Text node, where all contents is whitespace. v will be a GumboText. 
*/ + case GUMBO_NODE_WHITESPACE: { + // qDebug() << "whitespace(" << ")"; + break; + } + + /** Template node. This is separate from GUMBO_NODE_ELEMENT because many + * client libraries will want to ignore the contents of template nodes, as + * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing + * here, while clients that want to include template contents should also + * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */ + case GUMBO_NODE_TEMPLATE: { + qDebug() << "template(" << "???" << ")"; + break; + } + } +} + +std::unique_ptr<QTextDocument> HtmlRenderer::render( + QByteArray const &input, + QUrl const & root_url, + DocumentStyle const & style, + DocumentOutlineModel & outline, + QString & page_title) +{ + std::unique_ptr<GumboOutput, decltype(&destroyGumboOutput)> gumbo_output { + gumbo_parse_with_options(&gumbo_options, input.data(), input.length()), + &destroyGumboOutput, + }; + + if(gumbo_output->errors.length > 0) { + qDebug() << "Parsing the html document yielded" << gumbo_output->errors.length << "errors!"; + } + + if(gumbo_output->root->type != GUMBO_NODE_ELEMENT) { + qWarning() << "html document has no proper root node!"; + return nullptr; + } + + auto doc = std::make_unique<QTextDocument>(); + renderhelpers::setPageMargins(doc.get(), style.margin_h, style.margin_v); + doc->setIndentWidth(style.indent_size); + + outline.beginBuild(); + + // Find page title + { + const char* title = find_title(gumbo_output->root); + if(title != nullptr) { + page_title = QString::fromUtf8(title); + } + } + + { + GumboVector const * const root_children = &gumbo_output->root->v.element.children; + GumboNode* body = nullptr; + for (size_t i = 0; i < root_children->length; ++i) { + GumboNode* child = (GumboNode*)root_children->data[i]; + if (child->type == GUMBO_NODE_ELEMENT and child->v.element.tag == GUMBO_TAG_BODY) { + body = child; + break; + } + } + if(body != nullptr) + { + RenderState state { + QTextCursor { doc.get() }, + 
TextStyleInstance { style }, + root_url, + &style, + &outline, + }; + + state.cursor.setBlockFormat(state.text_style.standard_format); + state.cursor.setCharFormat(state.text_style.standard); + + renderRecursive(state, *body); + } + } + + + outline.endBuild(); + + return doc; +} diff --git a/src/renderers/htmlrenderer.hpp b/src/renderers/htmlrenderer.hpp new file mode 100644 index 0000000..d835236 --- /dev/null +++ b/src/renderers/htmlrenderer.hpp @@ -0,0 +1,31 @@ +#ifndef HTMLRENDERER_HPP +#define HTMLRENDERER_HPP + +#include <memory> +#include <QTextDocument> +#include <QColor> +#include <QSettings> + +#include "documentoutlinemodel.hpp" + +#include "documentstyle.hpp" + +struct HtmlRenderer +{ + HtmlRenderer() = delete; + + //! Renders the given byte sequence into a GeminiDocument. + //! @param input The utf8 encoded input string + //! @param root_url The url that is used to resolve relative links + //! @param style The style which is used to render the document + //! @param outline The extracted outline from the document + static std::unique_ptr<QTextDocument> render( + QByteArray const & input, + QUrl const & root_url, + DocumentStyle const & style, + DocumentOutlineModel & outline, + QString & page_title + ); +}; + +#endif // HTMLRENDERER_HPP diff --git a/src/renderers/textstyleinstance.cpp b/src/renderers/textstyleinstance.cpp index 044953c..7a7822e 100644 --- a/src/renderers/textstyleinstance.cpp +++ b/src/renderers/textstyleinstance.cpp @@ -60,7 +60,6 @@ TextStyleInstance::TextStyleInstance(DocumentStyle const & themed_style) preformatted_format.setIndent(themed_style.indent_p); - heading_format.setLineHeight(themed_style.line_height_h, - QTextBlockFormat::LineDistanceHeight); + heading_format.setLineHeight(themed_style.line_height_h, QTextBlockFormat::LineDistanceHeight); heading_format.setIndent(themed_style.indent_h); } |
