I'm trying to extract and replace the equivalent of JavaScript DocumentFragments using the jsoup DOM model.
Does anyone have ready-to-use code that emulates DOM Range selection and the operations on it? I would like to select a range of text that may pass through multiple inline nodes (such as <a>, <span>, etc.) and may start or end in the middle of such inline nodes. In JavaScript this is easy with Range operations: extracting a DocumentFragment from the range, surrounding it, and so on. I guess the JavaScript Range splits the inner nodes as needed so that such extraction and re-insertion work correctly. How would I do this with jsoup in Java?
Edit: Just thinking out loud about how to do this - I would probably need to find the "peak" (shallowest) element within my range, then go to both the start and the end of the range and "elevate" them to the "peak level": jump up to the parent if the start is child no. 0, or otherwise split the parent's child list just before the range start element (and symmetrically for the end). If such code already exists I'd rather re-use it; otherwise I will have to write it from scratch.
Update Dec. 18, 2015: Posted my answer with the working code I developed, see below.
Here is my promised code for wrapping an arbitrary range of DOM body into an arbitrary html tag for easy extraction, moving, replacement, copy/paste like operations etc.
Update Dec. 19, 2015: Added TextNode splitting in the middle of the text, by means of a wrapRange() variant that takes optional offsets into the text nodes where the range should start or end. Arbitrary copy/paste/move operations within the jsoup DOM model are now possible.
TODO: (for myself or some other good soul)
The RangeWrapper.java module:
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
import java.util.ArrayList;
import java.util.List;
/**
 * Created by greg on 12/18/2015.
 * <p>
 * Static helpers that wrap an arbitrary range of sibling or non-sibling nodes
 * into a single wrapper element, splitting ancestor elements (and, optionally,
 * text nodes) where the range boundary falls in the middle of them.
 * </p>
 */
public class RangeWrapper {

    /**
     * Wrap the supplied HTML around the "range" from startEl to endEl.
     * <p>
     * Both boundary nodes are first "elevated" to the shallowest depth found
     * between them; an ancestor that is only partly inside the range is split
     * into two elements carrying the same tag, base URI and a copy of the
     * attributes. The DOM is modified in place.
     * </p>
     *
     * @param startEl the first node to be included into the range
     * @param endEl the last node to be included into the range; it must not
     *        precede startEl in document order
     * @param html HTML to wrap around the range, e.g.
     *        {@code <span class="head"></span>}. Can be arbitrarily deep.
     * @return the element that directly contains the wrapped nodes (for a
     *         multi-level {@code html} this is its innermost element)
     * @throws IllegalArgumentException if {@code html} does not produce a
     *         wrapping element, or if endEl is not reached after startEl
     */
    public static Element wrapRange(Node startEl, Node endEl, String html) {
        if (startEl == endEl) { // special case: a single node needs no levelling
            return (Element) startEl.wrap(html).parentNode();
        }

        int startDepth = NodeWalker.getNodeDepth(startEl);
        int endDepth = NodeWalker.getNodeDepth(endEl);
        int minDepth = getRangeMinDepth(startEl, endEl);

        // Lift the start boundary up to minDepth.
        while (startDepth > minDepth) {
            Element parent = (Element) startEl.parentNode();
            int index = startEl.siblingIndex();
            if (index > 0) {
                // The parent is only partly inside the range: move startEl and
                // everything after it into a new sibling placed after the parent.
                // Copy the child list first, because moving nodes mutates it.
                List<Node> children = new ArrayList<Node>(parent.childNodes());
                Element tailHalf = emptyCopyOf(parent);
                parent.after(tailHalf);
                for (Node child : children.subList(index, children.size())) {
                    tailHalf.appendChild(child);
                }
                startEl = tailHalf;
            } else {
                // startEl is the first child, so the whole parent is in range.
                startEl = parent;
            }
            startDepth--;
        }

        // Lift the end boundary up to minDepth.
        while (endDepth > minDepth) {
            Element parent = (Element) endEl.parentNode();
            int index = endEl.siblingIndex();
            // siblingIndex() counts ALL child nodes (text included), so it must
            // be compared with childNodeSize(), not with the element-only list.
            if (index < parent.childNodeSize() - 1) {
                // Move everything up to and including endEl into a new sibling
                // placed before the parent.
                List<Node> children = new ArrayList<Node>(parent.childNodes());
                Element headHalf = emptyCopyOf(parent);
                parent.before(headHalf);
                for (Node child : children.subList(0, index + 1)) {
                    headHalf.appendChild(child);
                }
                endEl = headHalf;
            } else {
                // endEl is the last child, so the whole parent is in range.
                endEl = parent;
            }
            endDepth--;
        }

        // Now startEl and endEl are on the same depth == minDepth.
        // Wrap startEl with our html string.
        Node originalParent = startEl.parentNode();
        startEl.wrap(html);
        Node wrapped = startEl.parentNode();
        if (wrapped == originalParent || !(wrapped instanceof Element)) {
            throw new IllegalArgumentException(
                    "html did not produce an element to wrap the range with: " + html);
        }
        Element range = (Element) wrapped;

        // For a multi-level html, startEl sits in the innermost element while
        // the remaining range nodes are siblings of the outermost one.
        Node outermost = range;
        while (outermost.parentNode() != originalParent) {
            outermost = outermost.parentNode();
        }

        // Pull the following siblings into the wrapper, up to and including endEl.
        Node nextToAppend;
        do {
            nextToAppend = outermost.nextSibling();
            if (nextToAppend == null) {
                throw new IllegalArgumentException(
                        "endEl was not reached after startEl; endEl must follow startEl in document order");
            }
            range.appendChild(nextToAppend);
        } while (nextToAppend != endEl);
        return range;
    }

    /**
     * Wrap the supplied HTML around the "range" from startEl to endEl,
     * optionally starting and/or ending in the middle of a text node.
     * <p>
     * An offset is only applied when it lies strictly inside the text of its
     * node ({@code 0 < offset < length}); otherwise the whole node is taken.
     * When startEl and endEl are the same text node, only the text between the
     * two offsets is wrapped (stOffset is expected to be smaller than endOffset).
     * </p>
     *
     * @param startEl the first node to be included into the range
     * @param stOffset if startEl is a TextNode, split at this offset
     *        and include only the tail. Otherwise ignored.
     * @param endEl the last node to be included into the range
     * @param endOffset if endEl is a TextNode, split at this offset
     *        and include only the head. Otherwise ignored.
     * @param html HTML to wrap around the range, e.g. {@code <span class="head"></span>}. Can be arbitrarily deep.
     * @return the element that directly contains the wrapped nodes
     */
    public static Element wrapRange(Node startEl, int stOffset, Node endEl, int endOffset, String html) {
        boolean sameNode = startEl == endEl;

        // Split the end first: the original node keeps the head, so when both
        // boundaries are in the same text node stOffset remains valid afterwards.
        if (endOffset > 0 && endEl instanceof TextNode) {
            TextNode text = (TextNode) endEl;
            if (endOffset < text.getWholeText().length()) {
                text.splitText(endOffset); // tail goes to the DOM, we keep head == original endEl
            }
        }

        if (stOffset > 0 && startEl instanceof TextNode) {
            TextNode text = (TextNode) startEl;
            if (stOffset < text.getWholeText().length()) {
                startEl = text.splitText(stOffset); // splits text, adds tail to DOM, returns tail
                if (sameNode) {
                    // The range is now exactly the middle piece.
                    endEl = startEl;
                }
            }
        }
        return wrapRange(startEl, endEl, html);
    }

    /**
     * Calculate the minimum depth visited while walking from startNode to
     * endNode, using the depths reported by {@link NodeWalker}.
     * The body has depth 0.
     *
     * @param startNode the first node to be included into the range
     * @param endNode the last node to be included into the range
     * @return minimum depth found in the range
     */
    public static int getRangeMinDepth(final Node startNode, final Node endNode) {
        class DepthVisitor implements NodeWalker.NodeWalkVisitor {
            private int minDepth = Integer.MAX_VALUE;

            @Override
            public boolean head(Node node, int depth) {
                if (depth < minDepth) {
                    minDepth = depth;
                }
                return true;
            }

            @Override
            public boolean tail(Node node, int depth) {
                return true;
            }

            int getMinDepth() {
                return minDepth;
            }
        }

        DepthVisitor visitor = new DepthVisitor();
        new NodeWalker(visitor).walk(startNode, endNode);
        return visitor.getMinDepth();
    }

    /**
     * Create a childless element with the same tag and base URI as the source
     * and an independent copy of its attributes, so that the two halves of a
     * split element do not share one mutable attribute set.
     */
    private static Element emptyCopyOf(Element source) {
        return new Element(Tag.valueOf(source.tagName()), source.baseUri(), source.attributes().clone());
    }
}
...and the NodeWalker.java the above code uses, adapted from NodeTraversor and NodeVisitor classes in jsoup package:
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;
/**
 * Depth-first node traversor. Use to iterate through the nodes from a given
 * start node up to a given end node (or to the end of the tree).
 * <p>
 * This implementation does not use recursion, so a deep DOM does not risk blowing the stack.
 * </p>
 */
public class NodeWalker {
    private final NodeWalkVisitor visitor;

    /**
     * Create a new traversor.
     * @param visitor a class implementing the {@link NodeWalkVisitor} interface, to be called when visiting each node.
     */
    public NodeWalker(NodeWalkVisitor visitor) {
        this.visitor = visitor;
    }

    /**
     * Start a depth-first traverse from startNode, continuing through its
     * descendants and the nodes that follow it.
     * <p>
     * The walk stops when the visitor returns false, when {@code tail} has been
     * called for endNode, or when there are no more nodes to visit.
     * </p>
     * @param startNode the arbitrary start point node point within body to traverse from.
     * @param endNode the arbitrary end point node point within body where we stop traverse.
     *        Can be null, in which case we walk until the end of the body.
     */
    public void walk(Node startNode, Node endNode) {
        Node node = startNode;
        int depth = getNodeDepth(startNode); // let's calculate depth relative to body, body is depth 0

        while (node != null) {
            if (!visitor.head(node, depth)) {
                break;
            }
            if (node.childNodeSize() > 0) {
                // Descend to the first child.
                node = node.childNode(0);
                depth++;
            } else {
                // Leaf: climb while there is no next sibling to move to.
                while (node.nextSibling() == null && depth > 0) {
                    if (!visitor.tail(node, depth) || node == endNode) {
                        return;
                    }
                    node = node.parentNode();
                    depth--;
                    if (node == null) {
                        // Climbed past the root of a tree that has no body
                        // ancestor: nothing is left to visit.
                        return;
                    }
                }
                if (!visitor.tail(node, depth) || node == endNode) {
                    break;
                }
                node = node.nextSibling();
            }
        }
    }

    // The walkBack() was not needed, but leaving it here, may be useful for something...
    //    /**
    //     * Start a depth-first backward traverse of the whole body and all of its descendants.
    //     * @param startNode the arbitrary start point node point within body to traverse from.
    //     * @param endNode the arbitrary end point node point within body where we stop traverse.
    //     *                Can be null, in which case we walk until the end of the body.
    //     */
    //    public void walkBack(Node startNode, Node endNode) {
    //        Node node = startNode;
    //        int depth = getNodeDepth(startNode); // let's calculate depth relative to body, body is depth 0
    //
    //        while (node != null) {
    //            if (!visitor.tail(node, depth))
    //                break;
    //            if (node.childNodeSize() > 0) {
    //                node = node.childNode(node.childNodeSize() - 1);
    //                depth++;
    //            } else {
    //                while (node.previousSibling() == null && depth > 0) {
    //                    if (!visitor.head(node, depth) || node == endNode)
    //                        return;
    //                    node = node.parentNode();
    //                    depth--;
    //                }
    //                if (!visitor.head(node, depth) || node == endNode)
    //                    break;
    //                node = node.previousSibling();
    //            }
    //        }
    //    }

    /**
     * Calculate the depth of the given node relative to body. The body has depth 0.
     * <p>
     * If no body element is found among the ancestors, the result is the
     * number of steps taken until the parent chain ran out.
     * </p>
     * @param givenNode the node within the body to calculate depth for.
     * @return the depth of the givenNode
     */
    public static int getNodeDepth(Node givenNode) {
        Node node = givenNode;
        int depth = 0; // let's calculate depth relative to body, body is depth 0
        if (!isBody(node)) {
            do {
                depth++;
                node = node.parentNode();
            } while (node != null && !isBody(node));
        }
        return depth;
    }

    /** Tell whether the node is an element whose tag name is {@code body}. */
    private static boolean isBody(Node node) {
        return node instanceof Element && "body".equals(((Element) node).tagName());
    }

    public interface NodeWalkVisitor {
        /**
         * Callback for when a node is first visited.
         *
         * @param node the node being visited.
         * @param depth the depth of the node as tracked by the walker, relative to body. E.g., the body has
         *        depth 0, and a child node of that will have depth 1.
         * @return true to continue walk, false to abort
         */
        boolean head(Node node, int depth);

        /**
         * Callback for when a node is last visited, after all of its descendants have been visited.
         *
         * @param node the node being visited.
         * @param depth the depth of the node as tracked by the walker, relative to body. E.g., the body has
         *        depth 0, and a child node of that will have depth 1.
         * @return true to continue walk, false to abort
         */
        boolean tail(Node node, int depth);
    }
}
Greg