diff --git a/app/Entities/Tools/PageContent.php b/app/Entities/Tools/PageContent.php index 3e75bd5bb..99070ae89 100644 --- a/app/Entities/Tools/PageContent.php +++ b/app/Entities/Tools/PageContent.php @@ -11,6 +11,7 @@ use BookStack\Uploads\ImageRepo; use BookStack\Uploads\ImageService; use BookStack\Util\HtmlContentFilter; use BookStack\Util\HtmlDocument; +use Closure; use DOMElement; use DOMNode; use DOMNodeList; @@ -275,21 +276,65 @@ class PageContent */ public function render(bool $blankIncludes = false): string { - $content = $this->page->html ?? ''; + $html = $this->page->html ?? ''; + + if (empty($html)) { + return $html; + } + + $doc = new HtmlDocument($html); + $contentProvider = $this->getContentProviderClosure($blankIncludes); + $parser = new PageIncludeParser($doc, $contentProvider); + + $nodesAdded = 1; + for ($includeDepth = 0; $includeDepth < 3 && $nodesAdded !== 0; $includeDepth++) { + $nodesAdded = $parser->parse(); + } + + if ($includeDepth > 1) { + $idMap = []; + $changeMap = []; + $this->updateIdsRecursively($doc->getBody(), 0, $idMap, $changeMap); + } if (!config('app.allow_content_scripts')) { - $content = HtmlContentFilter::removeScripts($content); + HtmlContentFilter::removeScriptsFromDocument($doc); } - if ($blankIncludes) { - $content = $this->blankPageIncludes($content); - } else { - for ($includeDepth = 0; $includeDepth < 3; $includeDepth++) { - $content = $this->parsePageIncludes($content); + return $doc->getBodyInnerHtml(); + } + + /** + * Get the closure used to fetch content for page includes. + */ + protected function getContentProviderClosure(bool $blankIncludes): Closure + { + $contextPage = $this->page; + + return function (PageIncludeTag $tag) use ($blankIncludes, $contextPage): PageIncludeContent { + if ($blankIncludes) { + return PageIncludeContent::fromHtmlAndTag('', $tag); } - } - return $content; + $matchedPage = Page::visible()->find($tag->getPageId()); + $content = PageIncludeContent::fromHtmlAndTag($matchedPage->html ?? 
'', $tag); + + if (Theme::hasListeners(ThemeEvents::PAGE_INCLUDE_PARSE)) { + $themeReplacement = Theme::dispatch( + ThemeEvents::PAGE_INCLUDE_PARSE, + $tag->tagContent, + $content->toHtml(), + clone $contextPage, + $matchedPage ? (clone $matchedPage) : null, + ); + + if ($themeReplacement !== null) { + $content = PageIncludeContent::fromInlineHtml(strval($themeReplacement)); + } + } + + return $content; + }; } /** @@ -337,83 +382,4 @@ class PageContent return $tree->toArray(); } - - /** - * Remove any page include tags within the given HTML. - */ - protected function blankPageIncludes(string $html): string - { - return preg_replace("/{{@\s?([0-9].*?)}}/", '', $html); - } - - /** - * Parse any include tags "{{@#section}}" to be part of the page. - */ - protected function parsePageIncludes(string $html): string - { - $matches = []; - preg_match_all("/{{@\s?([0-9].*?)}}/", $html, $matches); - - foreach ($matches[1] as $index => $includeId) { - $fullMatch = $matches[0][$index]; - $splitInclude = explode('#', $includeId, 2); - - // Get page id from reference - $pageId = intval($splitInclude[0]); - if (is_nan($pageId)) { - continue; - } - - // Find page to use, and default replacement to empty string for non-matches. - /** @var ?Page $matchedPage */ - $matchedPage = Page::visible()->find($pageId); - $replacement = ''; - - if ($matchedPage && count($splitInclude) === 1) { - // If we only have page id, just insert all page html and continue. - $replacement = $matchedPage->html; - } elseif ($matchedPage && count($splitInclude) > 1) { - // Otherwise, if our include tag defines a section, load that specific content - $innerContent = $this->fetchSectionOfPage($matchedPage, $splitInclude[1]); - $replacement = trim($innerContent); - } - - $themeReplacement = Theme::dispatch( - ThemeEvents::PAGE_INCLUDE_PARSE, - $includeId, - $replacement, - clone $this->page, - $matchedPage ? 
(clone $matchedPage) : null, - ); - - // Perform the content replacement - $html = str_replace($fullMatch, $themeReplacement ?? $replacement, $html); - } - - return $html; - } - - /** - * Fetch the content from a specific section of the given page. - */ - protected function fetchSectionOfPage(Page $page, string $sectionId): string - { - $topLevelTags = ['table', 'ul', 'ol', 'pre']; - $doc = new HtmlDocument($page->html); - - // Search included content for the id given and blank out if not exists. - $matchingElem = $doc->getElementById($sectionId); - if ($matchingElem === null) { - return ''; - } - - // Otherwise replace the content with the found content - // Checks if the top-level wrapper should be included by matching on tag types - $isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags); - if ($isTopLevel) { - return $doc->getNodeOuterHtml($matchingElem); - } - - return $doc->getNodeInnerHtml($matchingElem); - } } diff --git a/app/Entities/Tools/PageIncludeContent.php b/app/Entities/Tools/PageIncludeContent.php new file mode 100644 index 000000000..7c4f943c8 --- /dev/null +++ b/app/Entities/Tools/PageIncludeContent.php @@ -0,0 +1,85 @@ +getSectionId(); + if (!$sectionId) { + $contents = [...$doc->getBodyChildren()]; + return new self($contents, false); + } + + $section = $doc->getElementById($sectionId); + if (!$section) { + return new self([], true); + } + + $isTopLevel = in_array(strtolower($section->nodeName), static::$topLevelTags); + $contents = $isTopLevel ? 
[$section] : [...$section->childNodes]; + return new self($contents, !$isTopLevel); + } + + public static function fromInlineHtml(string $html): self + { + if (empty($html)) { + return new self([], true); + } + + $doc = new HtmlDocument($html); + + return new self([...$doc->getBodyChildren()], true); + } + + public function isInline(): bool + { + return $this->isInline; + } + + public function isEmpty(): bool + { + return empty($this->contents); + } + + /** + * @return DOMNode[] + */ + public function toDomNodes(): array + { + return $this->contents; + } + + public function toHtml(): string + { + $html = ''; + + foreach ($this->contents as $content) { + $html .= $content->ownerDocument->saveHTML($content); + } + + return $html; + } +} diff --git a/app/Entities/Tools/PageIncludeParser.php b/app/Entities/Tools/PageIncludeParser.php new file mode 100644 index 000000000..f1fbfba03 --- /dev/null +++ b/app/Entities/Tools/PageIncludeParser.php @@ -0,0 +1,220 @@ +locateAndIsolateIncludeTags(); + + foreach ($tags as $tag) { + /** @var PageIncludeContent $content */ + $content = $this->pageContentForId->call($this, $tag); + + if (!$content->isInline()) { + $parentP = $this->getParentParagraph($tag->domNode); + $isWithinParentP = $parentP === $tag->domNode->parentNode; + if ($parentP && $isWithinParentP) { + $this->splitNodeAtChildNode($tag->domNode->parentNode, $tag->domNode); + } else if ($parentP) { + $this->moveTagNodeToBesideParent($tag, $parentP); + } + } + + $replacementNodes = $content->toDomNodes(); + $nodesAdded += count($replacementNodes); + $this->replaceNodeWithNodes($tag->domNode, $replacementNodes); + } + + $this->cleanup(); + + return $nodesAdded; + } + + /** + * Locate include tags within the given document, isolating them to their + * own nodes in the DOM for future targeted manipulation. 
+ * @return PageIncludeTag[] + */ + protected function locateAndIsolateIncludeTags(): array + { + $includeHosts = $this->doc->queryXPath("//*[text()[contains(., '{{@')]]"); + $includeTags = []; + + /** @var DOMNode $node */ + /** @var DOMNode $childNode */ + foreach ($includeHosts as $node) { + foreach ($node->childNodes as $childNode) { + if ($childNode->nodeName === '#text') { + array_push($includeTags, ...$this->splitTextNodesAtTags($childNode)); + } + } + } + + return $includeTags; + } + + /** + * Takes a text DOMNode and splits its text content at include tags + * into multiple text nodes within the original parent. + * Returns found PageIncludeTag references. + * @return PageIncludeTag[] + */ + protected function splitTextNodesAtTags(DOMNode $textNode): array + { + $includeTags = []; + $text = $textNode->textContent; + preg_match_all(static::$includeTagRegex, $text, $matches, PREG_OFFSET_CAPTURE); + + $currentOffset = 0; + foreach ($matches[0] as $index => $fullTagMatch) { + $tagOuterContent = $fullTagMatch[0]; + $tagInnerContent = $matches[1][$index][0]; + $tagStartOffset = $fullTagMatch[1]; + + if ($currentOffset < $tagStartOffset) { + $previousText = substr($text, $currentOffset, $tagStartOffset - $currentOffset); + $textNode->parentNode->insertBefore(new DOMText($previousText), $textNode); + } + + $node = $textNode->parentNode->insertBefore(new DOMText($tagOuterContent), $textNode); + $includeTags[] = new PageIncludeTag($tagInnerContent, $node); + $currentOffset = $tagStartOffset + strlen($tagOuterContent); + } + + if ($currentOffset > 0) { + $textNode->textContent = substr($text, $currentOffset); + } + + return $includeTags; + } + + /** + * Replace the given node with all those in $replacements + * @param DOMNode[] $replacements + */ + protected function replaceNodeWithNodes(DOMNode $toReplace, array $replacements): void + { + /** @var DOMDocument $targetDoc */ + $targetDoc = $toReplace->ownerDocument; + + foreach ($replacements as $replacement) { + if 
($replacement->ownerDocument !== $targetDoc) { + $replacement = $targetDoc->importNode($replacement, true); + } + + $toReplace->parentNode->insertBefore($replacement, $toReplace); + } + + $toReplace->parentNode->removeChild($toReplace); + } + + /** + * Move a tag node to become a sibling of the given parent. + * Will attempt to guess a position based upon the tag content within the parent. + */ + protected function moveTagNodeToBesideParent(PageIncludeTag $tag, DOMNode $parent): void + { + $parentText = $parent->textContent; + $tagPos = strpos($parentText, $tag->tagContent); + $before = $tagPos < (strlen($parentText) / 2); + $this->toCleanup[] = $tag->domNode->parentNode; + + if ($before) { + $parent->parentNode->insertBefore($tag->domNode, $parent); + } else { + $parent->parentNode->insertBefore($tag->domNode, $parent->nextSibling); + } + } + + /** + * Splits the given $parentNode at the location of the $domNode within it. + * Attempts to replicate the original $parentNode, moving some of its + * children into the clone where needed, before adding the $domNode between. + */ + protected function splitNodeAtChildNode(DOMElement $parentNode, DOMNode $domNode): void + { + $children = [...$parentNode->childNodes]; + $splitPos = array_search($domNode, $children, true); + if ($splitPos === false) { + $splitPos = count($children) - 1; + } + + $parentClone = $parentNode->cloneNode(); + $parentNode->parentNode->insertBefore($parentClone, $parentNode); + $parentClone->removeAttribute('id'); + + /** @var DOMNode $child */ + for ($i = 0; $i < $splitPos; $i++) { + $child = $children[$i]; + $parentClone->appendChild($child); + } + + $parentNode->parentNode->insertBefore($domNode, $parentNode); + + $this->toCleanup[] = $parentNode; + $this->toCleanup[] = $parentClone; + } + + /** + * Get the parent paragraph of the given node, if existing.
+ */ + protected function getParentParagraph(DOMNode $parent): ?DOMNode + { + do { + if (strtolower($parent->nodeName) === 'p') { + return $parent; + } + + $parent = $parent->parentNode; + } while ($parent !== null); + + return null; + } + + /** + * Cleanup after a parse operation. + * Removes stranded elements we may have left during the parse. + */ + protected function cleanup(): void + { + foreach ($this->toCleanup as $element) { + $element->normalize(); + while ($element->parentNode && !$element->hasChildNodes()) { + $parent = $element->parentNode; + $parent->removeChild($element); + $element = $parent; + } + } + } +} diff --git a/app/Entities/Tools/PageIncludeTag.php b/app/Entities/Tools/PageIncludeTag.php new file mode 100644 index 000000000..05a532fb2 --- /dev/null +++ b/app/Entities/Tools/PageIncludeTag.php @@ -0,0 +1,30 @@ +tagContent, 2)[0])); + } + + /** + * Get the section ID that this tag references (if any) + */ + public function getSectionId(): string + { + return trim(explode('#', $this->tagContent, 2)[1] ?? ''); + } +} diff --git a/app/Theming/CustomHtmlHeadContentProvider.php b/app/Theming/CustomHtmlHeadContentProvider.php index 041e5d025..95d9ff5ad 100644 --- a/app/Theming/CustomHtmlHeadContentProvider.php +++ b/app/Theming/CustomHtmlHeadContentProvider.php @@ -50,7 +50,7 @@ class CustomHtmlHeadContentProvider $hash = md5($content); return $this->cache->remember('custom-head-export:' . $hash, 86400, function () use ($content) { - return HtmlContentFilter::removeScripts($content); + return HtmlContentFilter::removeScriptsFromHtmlString($content); }); } diff --git a/app/Theming/ThemeEvents.php b/app/Theming/ThemeEvents.php index 9e14707de..3d8cd4167 100644 --- a/app/Theming/ThemeEvents.php +++ b/app/Theming/ThemeEvents.php @@ -2,8 +2,6 @@ namespace BookStack\Theming; -use BookStack\Entities\Models\Page; - /** * The ThemeEvents used within BookStack. 
* @@ -93,8 +91,8 @@ class ThemeEvents * * @param string $tagReference * @param string $replacementHTML - * @param Page $currentPage - * @param ?Page $referencedPage + * @param \BookStack\Entities\Models\Page $currentPage + * @param ?\BookStack\Entities\Models\Page $referencedPage */ const PAGE_INCLUDE_PARSE = 'page_include_parse'; diff --git a/app/Theming/ThemeService.php b/app/Theming/ThemeService.php index 31a7d3c64..0c2526536 100644 --- a/app/Theming/ThemeService.php +++ b/app/Theming/ThemeService.php @@ -48,6 +48,14 @@ class ThemeService return null; } + /** + * Check if there are listeners registered for the given event name. + */ + public function hasListeners(string $event): bool + { + return count($this->listeners[$event] ?? []) > 0; + } + /** * Register a new custom artisan command to be available. */ diff --git a/app/Util/HtmlContentFilter.php b/app/Util/HtmlContentFilter.php index 2dbb34086..758591729 100644 --- a/app/Util/HtmlContentFilter.php +++ b/app/Util/HtmlContentFilter.php @@ -9,16 +9,10 @@ use DOMNodeList; class HtmlContentFilter { /** - * Remove all the script elements from the given HTML. + * Remove all the script elements from the given HTML document. */ - public static function removeScripts(string $html): string + public static function removeScriptsFromDocument(HtmlDocument $doc) { - if (empty($html)) { - return $html; - } - - $doc = new HtmlDocument($html); - // Remove standard script tags $scriptElems = $doc->queryXPath('//script'); static::removeNodes($scriptElems); @@ -53,6 +47,19 @@ class HtmlContentFilter // Remove 'on*' attributes $onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]'); static::removeAttributes($onAttributes); + } + + /** + * Remove scripts from the given HTML string. 
+ */ + public static function removeScriptsFromHtmlString(string $html): string + { + if (empty($html)) { + return $html; + } + + $doc = new HtmlDocument($html); + static::removeScriptsFromDocument($doc); return $doc->getBodyInnerHtml(); } diff --git a/tests/Entity/PageContentTest.php b/tests/Entity/PageContentTest.php index d8845fe12..958598fda 100644 --- a/tests/Entity/PageContentTest.php +++ b/tests/Entity/PageContentTest.php @@ -8,7 +8,7 @@ use Tests\TestCase; class PageContentTest extends TestCase { - protected $base64Jpeg = '/9j/2wBDAAMCAgICAgMCAgIDAwMDBAYEBAQEBAgGBgUGCQgKCgkICQkKDA8MCgsOCwkJDRENDg8QEBEQCgwSExIQEw8QEBD/yQALCAABAAEBAREA/8wABgAQEAX/2gAIAQEAAD8A0s8g/9k='; + protected string $base64Jpeg = '/9j/2wBDAAMCAgICAgMCAgIDAwMDBAYEBAQEBAgGBgUGCQgKCgkICQkKDA8MCgsOCwkJDRENDg8QEBEQCgwSExIQEw8QEBD/yQALCAABAAEBAREA/8wABgAQEAX/2gAIAQEAAD8A0s8g/9k='; public function test_page_includes() { @@ -57,38 +57,6 @@ class PageContentTest extends TestCase $this->assertEquals('', $page->text); } - public function test_page_includes_do_not_break_tables() - { - $page = $this->entities->page(); - $secondPage = $this->entities->page(); - - $content = '
test
'; - $secondPage->html = $content; - $secondPage->save(); - - $page->html = "{{@{$secondPage->id}#table}}"; - $page->save(); - - $pageResp = $this->asEditor()->get($page->getUrl()); - $pageResp->assertSee($content, false); - } - - public function test_page_includes_do_not_break_code() - { - $page = $this->entities->page(); - $secondPage = $this->entities->page(); - - $content = '
var cat = null;
'; - $secondPage->html = $content; - $secondPage->save(); - - $page->html = "{{@{$secondPage->id}#bkmrk-code}}"; - $page->save(); - - $pageResp = $this->asEditor()->get($page->getUrl()); - $pageResp->assertSee($content, false); - } - public function test_page_includes_rendered_on_book_export() { $page = $this->entities->page(); @@ -120,6 +88,19 @@ class PageContentTest extends TestCase $this->withHtml($pageResp)->assertElementNotContains('#bkmrk-test', 'Hello Barry Hello Barry Hello Barry Hello Barry Hello Barry ' . $tag); } + public function test_page_includes_to_nonexisting_pages_does_not_error() + { + $page = $this->entities->page(); + $missingId = Page::query()->max('id') + 1; + $tag = "{{@{$missingId}}}"; + $page->html = '

Hello Barry ' . $tag . '

'; + $page->save(); + + $pageResp = $this->asEditor()->get($page->getUrl()); + $pageResp->assertOk(); + $pageResp->assertSee('Hello Barry'); + } + public function test_page_content_scripts_removed_by_default() { $this->asEditor(); diff --git a/tests/Unit/PageIncludeParserTest.php b/tests/Unit/PageIncludeParserTest.php new file mode 100644 index 000000000..83fded436 --- /dev/null +++ b/tests/Unit/PageIncludeParserTest.php @@ -0,0 +1,240 @@ +runParserTest( + '

{{@45#content}}

', + ['45' => '

Testing

'], + '

Testing

', + ); + } + + public function test_simple_inline_text_with_existing_siblings() + { + $this->runParserTest( + '

{{@45#content}} Hithere!

', + ['45' => '

Testing

'], + '

Testing Hithere!

', + ); + } + + public function test_simple_inline_text_within_other_text() + { + $this->runParserTest( + '

Hello {{@45#content}}there!

', + ['45' => '

Testing

'], + '

Hello Testingthere!

', + ); + } + + public function test_complex_inline_text_within_other_text() + { + $this->runParserTest( + '

Hello {{@45#content}}there!

', + ['45' => '

Testing withsomeextratags

'], + '

Hello Testing withsomeextratagsthere!

', + ); + } + + public function test_block_content_types() + { + $inputs = [ + '
Text
', + '', + '
  1. Item A
', + '
Code
', + ]; + + foreach ($inputs as $input) { + $this->runParserTest( + '

A{{@45#content}}B

', + ['45' => $input], + '

A

' . $input . '

B

', + ); + } + } + + public function test_block_content_nested_origin_gets_placed_before() + { + $this->runParserTest( + '

A {{@45#content}} there!

', + ['45' => '
Testing
'], + '
Testing

A there!

', + ); + } + + public function test_block_content_nested_origin_gets_placed_after() + { + $this->runParserTest( + '

Some really good {{@45#content}} there!

', + ['45' => '
Testing
'], + '

Some really good there!

Testing
', + ); + } + + public function test_block_content_in_shallow_origin_gets_split() + { + $this->runParserTest( + '

Some really good {{@45#content}} there!

', + ['45' => '
doggos
'], + '

Some really good

doggos

there!

', + ); + } + + public function test_block_content_in_shallow_origin_split_does_not_duplicate_id() + { + $this->runParserTest( + '

Some really good {{@45#content}} there!

', + ['45' => '
doggos
'], + '

Some really good

doggos

there!

', + ); + } + + public function test_block_content_in_shallow_origin_does_not_leave_empty_nodes() + { + $this->runParserTest( + '

{{@45#content}}

', + ['45' => '
doggos
'], + '
doggos
', + ); + } + + public function test_block_content_in_allowable_parent_element() + { + $this->runParserTest( + '
{{@45#content}}
', + ['45' => '
doggos
'], + '
doggos
', + ); + } + + public function test_block_content_in_paragraph_origin_with_allowable_grandparent() + { + $this->runParserTest( + '

{{@45#content}}

', + ['45' => '
doggos
'], + '
doggos
', + ); + } + + public function test_block_content_in_paragraph_origin_with_allowable_grandparent_with_adjacent_content() + { + $this->runParserTest( + '

Cute {{@45#content}} over there!

', + ['45' => '
doggos
'], + '

Cute

doggos

over there!

', + ); + } + + public function test_block_content_in_child_within_paragraph_origin_with_allowable_grandparent_with_adjacent_content() + { + $this->runParserTest( + '

Cute {{@45#content}} over there!

', + ['45' => '
doggos
'], + '
doggos

Cute over there!

', + ); + } + + public function test_block_content_in_paragraph_origin_within_details() + { + $this->runParserTest( + '

{{@45#content}}

', + ['45' => '
doggos
'], + '
doggos
', + ); + } + + public function test_simple_whole_document() + { + $this->runParserTest( + '

{{@45}}

', + ['45' => '

Testing

'], + '

Testing

', + ); + } + + public function test_multi_source_elem_whole_document() + { + $this->runParserTest( + '

{{@45}}

', + ['45' => '

Testing

This
'], + '

Testing

This
', + ); + } + + public function test_multi_source_elem_whole_document_with_shared_content_origin() + { + $this->runParserTest( + '

This is {{@45}} some text

', + ['45' => '

Testing

This
'], + '

This is

Testing

This

some text

', + ); + } + + public function test_multi_source_elem_whole_document_with_nested_content_origin() + { + $this->runParserTest( + '

{{@45}}

', + ['45' => '

Testing

This
'], + '

Testing

This
', + ); + } + + public function test_multiple_tags_in_same_origin_with_inline_content() + { + $this->runParserTest( + '

This {{@45#content}}{{@45#content}} content is {{@45#content}}

', + ['45' => '

inline

'], + '

This inlineinline content is inline

', + ); + } + + public function test_multiple_tags_in_same_origin_with_block_content() + { + $this->runParserTest( + '

This {{@45#content}}{{@45#content}} content is {{@45#content}}

', + ['45' => '
block
'], + '

This

block
block

content is

block
', + ); + } + + public function test_multiple_tags_in_differing_origin_levels_with_block_content() + { + $this->runParserTest( + '

This {{@45#content}} content is {{@45#content}}

{{@45#content}}
', + ['45' => '
block
'], + '
block

This content is

block
block
', + ); + } + + public function test_multiple_tags_in_shallow_origin_with_multi_block_content() + { + $this->runParserTest( + '

{{@45}}C{{@45}}

{{@45}}{{@45}}
', + ['45' => '

A

B

'], + '

A

B

C

A

B

A

B

A

B

', + ); + } + + protected function runParserTest(string $html, array $contentById, string $expected): void + { + $doc = new HtmlDocument($html); + $parser = new PageIncludeParser($doc, function (PageIncludeTag $tag) use ($contentById): PageIncludeContent { + $html = $contentById[strval($tag->getPageId())] ?? ''; + return PageIncludeContent::fromHtmlAndTag($html, $tag); + }); + + $parser->parse(); + $this->assertEquals($expected, $doc->getBodyInnerHtml()); + } +}