Search code examples
phpdomdocumentpreg-replace-callback

set tags in html using domdocument and preg_replace_callback


I'm trying to replace words that appear in my terminology dictionary with an HTML anchor so that each one gets a tooltip. The replacement part works, but I can't get the result back into the DOMDocument object.

I've written a recursive function that walks the DOM: it visits every child node, searches its text for words from my dictionary, and replaces each match with an anchor.

Until now I've been doing this with an ordinary preg_match on the raw HTML, but that runs into problems as soon as the HTML gets complex.

The recursive function:

// Glossary consulted by iterate_html(): term => explanation shown in the tooltip.
$terms = [
    'example' => 'explanation about example',
];

/**
 * Wrap every glossary term found in text content with a tooltip anchor.
 *
 * The glossary is read from the global $terms array (term => explanation).
 * Terms are matched case-insensitively on word boundaries; the explanation
 * is looked up by the lowercased match, so key casing does not matter.
 * Text inside <a>, <script> and <style> elements is left untouched.
 *
 * @param DOMNode|string   $doc          Node whose descendants are rewritten in place,
 *                                       or an HTML fragment given as a string.
 * @param DOMDocument|null $original_doc Document that owns $doc; derived from $doc when omitted.
 *
 * @return string|null The rewritten HTML when $doc was a string, otherwise null
 *                     (the DOM is modified in place).
 */
function iterate_html($doc, $original_doc = null)
{
    global $terms;

    // Convenience path for plain strings: parse, rewrite, serialise back.
    if (is_string($doc)) {
        if ($doc === '') {
            return '';
        }

        // A wrapper element gives the fragment a single root; it is stripped again below.
        // NOTE(review): libxml guesses the encoding when none is declared, so non-ASCII
        // fragments may need an explicit charset hint - verify against your input.
        $dom = new DOMDocument();
        $previous = libxml_use_internal_errors(true);
        $dom->loadHTML('<div>' . $doc . '</div>', LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        iterate_html($dom->documentElement, $dom);

        $html = '';
        foreach ($dom->documentElement->childNodes as $child) {
            $html .= $dom->saveHTML($child);
        }

        return $html;
    }

    if (empty($terms)) {
        return null;
    }

    // New nodes must be created by the document that owns the tree being edited.
    if (is_null($original_doc)) {
        $original_doc = ($doc instanceof DOMDocument) ? $doc : $doc->ownerDocument;
    }

    // Lowercased lookup table so a match in any casing finds its explanation.
    $lookup = array();
    foreach ($terms as $term => $explanation) {
        $lookup[strtolower((string) $term)] = $explanation;
    }

    // Longest terms first: in an alternation the first alternative that matches wins.
    $keys = array_map('strval', array_keys($lookup));
    usort($keys, function ($a, $b) {
        return strlen($b) - strlen($a);
    });

    // Quote each term so regex metacharacters in the glossary are matched literally.
    $quoted = array_map(function ($term) {
        return preg_quote($term, '~');
    }, $keys);
    $regex = '~\b(' . implode('|', $quoted) . ')\b~i';

    iterate_html_replace_terms($doc, $original_doc, $regex, $lookup);

    return null;
}

/**
 * Recursive worker for iterate_html(): replaces glossary matches below $node.
 *
 * @param DOMNode     $node     Subtree root to process.
 * @param DOMDocument $document Owner document used to create the new nodes.
 * @param string      $regex    Pattern with one capture group around the term alternation.
 * @param array       $lookup   Lowercased term => explanation.
 *
 * @return void
 */
function iterate_html_replace_terms(DOMNode $node, DOMDocument $document, $regex, array $lookup)
{
    if (!$node->hasChildNodes()) {
        return;
    }

    // childNodes is a live list; iterate over a snapshot because nodes get replaced below.
    foreach (iterator_to_array($node->childNodes) as $child) {
        if ($child->nodeType === XML_ELEMENT_NODE) {
            // Never nest anchors and never touch script/style source.
            if (!in_array(strtolower($child->nodeName), array('a', 'script', 'style'), true)) {
                iterate_html_replace_terms($child, $document, $regex, $lookup);
            }
            continue;
        }

        if ($child->nodeType !== XML_TEXT_NODE) {
            continue;
        }

        // With PREG_SPLIT_DELIM_CAPTURE the odd indexes hold the matched terms.
        $parts = preg_split($regex, $child->nodeValue, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($parts === false || count($parts) < 2) {
            continue; // regex failure or no term in this text node
        }

        $fragment = $document->createDocumentFragment();
        foreach ($parts as $index => $part) {
            $key = strtolower($part);

            if ($index % 2 === 1 && isset($lookup[$key])) {
                $anchor = $document->createElement('a');
                // A text node (unlike createElement's value argument) is escaped safely.
                $anchor->appendChild($document->createTextNode($part));
                $anchor->setAttribute('class', 'text-info');
                $anchor->setAttribute('data-toggle', 'tooltip');
                $anchor->setAttribute('data-original-title', $lookup[$key]);
                $fragment->appendChild($anchor);
            } elseif ($part !== '') {
                $fragment->appendChild($document->createTextNode($part));
            }
        }

        $child->parentNode->replaceChild($fragment, $child);
    }
}

// Run the sample sentence through iterate_html() and output whatever it returns.
print iterate_html('this is just some example text.');

I expect the result to be:

this is just some <a class="text-info" data-toggle="tooltip" data-original-title="explanation about example">example</a> text

Solution

  • I don't think building a recursive function to walk the DOM is useful when you can use an XPath query. Also, I'm not sure that preg_replace_callback is a suitable function for this case; I prefer to use preg_split. Here is an example:

    // Demo: wrap every glossary term found in $html with a tooltip anchor and echo the result.
    $html = 'this is just some example text.';

    // term => explanation shown in the tooltip
    $terms = array(
       'example'=>'explanation about example'
    );

    // sort by reverse order of key size
    // (to be sure that the longest string always wins instead of the first in the pattern)
    uksort($terms, function ($a, $b) {
        $diff = mb_strlen($b) - mb_strlen($a);

        return ($diff) ? $diff : strcmp($a, $b);
    });

    // lowercased lookup table, so a match in any casing (including non-ASCII letters)
    // finds its explanation regardless of how the key was written in $terms
    $explanations = array();
    foreach ($terms as $term => $explanation) {
        $explanations[mb_strtolower((string) $term, 'UTF-8')] = $explanation;
    }

    // without input or without terms there is nothing to replace: the html is returned as is
    // (an empty term list would otherwise build a pattern that matches empty strings)
    $result = $html;

    if ($html !== '' && count($terms) > 0) {
        // build the pattern inside a capture group (to have delimiters in the results with the
        // PREG_SPLIT_DELIM_CAPTURE option); the "u" modifier makes the matching UTF-8 aware,
        // which is safe here because DOM node values are always UTF-8
        $pattern = '~\b(' . implode('|', array_map(function($i) { return preg_quote((string) $i, '~'); }, array_keys($terms))) . ')\b~iu';

        // prevent eventual html errors to be displayed
        $libxmlInternalErrors = libxml_use_internal_errors(true);

        // determine if the html string have a root html element already, if not add a fake root.
        $dom = new DOMDocument;
        $dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
        $fakeRootElement = false;

        // documentElement is null when the parser produced no element at all
        if ( $dom->documentElement === null || $dom->documentElement->nodeName !== 'html' ) {
            $dom->loadHTML("<div>$html</div>", LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED);
            $fakeRootElement = true;
        }

        // drop the collected parse errors before restoring the previous error mode
        libxml_clear_errors();
        libxml_use_internal_errors($libxmlInternalErrors);

        // find all text nodes (not already included in a link or between other unwanted tags)
        $xp = new DOMXPath($dom);
        $textNodes = $xp->query('//text()[not(ancestor::a)][not(ancestor::style)][not(ancestor::script)]');

        // replacement
        foreach ($textNodes as $textNode) {
            $parts = preg_split($pattern, $textNode->nodeValue, -1, PREG_SPLIT_DELIM_CAPTURE);

            // false: the pattern could not be applied; a single part: no term in this node
            if ($parts === false || count($parts) < 2) {
                continue;
            }

            $fragment = $dom->createDocumentFragment();
            foreach ($parts as $k=>$part) {
                $key = mb_strtolower($part, 'UTF-8');

                // odd indexes are the captured delimiters, i.e. the matched terms
                if (($k&1) && isset($explanations[$key])) {
                    $anchor = $dom->createElement('a');
                    // createElement() does not escape its value argument, a text node does
                    $anchor->appendChild($dom->createTextNode($part));
                    $anchor->setAttribute('class', 'text-info');
                    $anchor->setAttribute('data-toggle', 'tooltip');
                    $anchor->setAttribute('data-original-title', $explanations[$key]);
                    $fragment->appendChild($anchor);
                } elseif ($part !== '') {
                    $fragment->appendChild($dom->createTextNode($part));
                }
            }
            $textNode->parentNode->replaceChild($fragment, $textNode);
        }

        // building of the result string
        $result = '';

        if ( $fakeRootElement ) {
            // serialize the children only, so the fake root does not show up in the output
            foreach ($dom->documentElement->childNodes as $childNode) {
                $result .= $dom->saveHTML($childNode);
            }
        } else {
            $result = $dom->saveHTML();
        }
    }

    echo $result;
    

    demo

    Feel free to put that into one or more functions/methods, but keep in mind that this kind of editing has a non-negligible cost and should be done each time the HTML is edited (and not each time the HTML is displayed).