Is there a more efficient way to perform a large number of 'replaceAll' calls, using as little memory as possible?
/**
 * Ordered rewrite rules used by {@code cleanWordTags}: each row is {regex, replacement}.
 * Rules run top to bottom, and each rule sees the output of the previous one.
 */
private static final String[][] WORD_TAG_RULES = {
    // Drop Word-generated attributes but keep the element itself.
    {"<P style=\"M[^>]*>", "<P>"},
    {"<p style=\"M[^>]*>", "<p>"},
    {"<p style=\"T[^>]*>", "<p>"},
    {"<b style=[^>]*>", "<b>"},
    {"<span class=\"M[^>]*>", "<span>"},
    {"<span style='m[^>]*>", "<span>"},
    {"<span style=\"f[^>]*>", "<span>"},
    {"<span lang[^>]*>", "<span>"},
    {"<span style=\"color[^>]*>", "<span>"},
    {"<span style=\"m[^>]*>", "<span>"},
    {"<span style=\"line[^>]*>", "<span>"},
    {"<span style=\"L[^>]*>", "<span>"},
    {"<span style=\"T[^>]*>", "<span>"},
    {"<span style=\"t[^>]*>", "<span>"},
    {"<br [^>]*>", "<br/>"},
    // Remove styled italics and Word smart-tag wrappers, keeping their inner text.
    {"<i style=[^>]*>", ""},
    {"</i>", ""},
    {"<st1:personname[^>]*>", ""},
    {"</st1:personname>", ""},
    {"<st1:metricconverter[^>]*>", ""},
    {"</st1:metricconverter>", ""},
    {"<br[^>]*>", "<br/>"},
    // Remove conditional comments, declarations and v:/o:/w:/m: namespaced tags.
    {"<\\W\\Wendif\\W\\W\\W>", ""},
    {"<![^>]*>", ""},
    {"<[vowm]:[^>]*>", ""},
    {"</[vowm]:[^>]*>", ""},
    {"&(amp|lt|gt);", ""},
    // The image rules contain a space, so they must run BEFORE the space-stripping
    // rule below; in the opposite order they can never match.
    {"<img width[^>]*>", ""},
    {"<img src=\"file:[^>]*>", ""},
    // NOTE(review): this removes every space character from the text. It may originally
    // have targeted a non-breaking-space entity that was lost in transcription - confirm.
    {" ", ""},
};

/** {@link #WORD_TAG_RULES} compiled once at class load instead of on every call. */
private static final java.util.regex.Pattern[] WORD_TAG_PATTERNS = compileWordTagRules();

/** Compiles the regex column of {@link #WORD_TAG_RULES}, preserving rule order. */
private static java.util.regex.Pattern[] compileWordTagRules() {
    java.util.regex.Pattern[] compiled = new java.util.regex.Pattern[WORD_TAG_RULES.length];
    for (int i = 0; i < compiled.length; i++) {
        compiled[i] = java.util.regex.Pattern.compile(WORD_TAG_RULES[i][0]);
    }
    return compiled;
}

/**
 * Strips Microsoft Word specific markup from an HTML fragment.
 *
 * <p>Applies every rule in {@link #WORD_TAG_RULES} in order, using precompiled patterns.
 *
 * @param source the HTML text to clean; may be {@code null} or empty
 * @return the cleaned text; {@code source} itself when it is {@code null} or empty
 */
public static String cleanWordTags(String source) {
    if (source == null || source.isEmpty()) {
        return source;
    }
    String cleaned = source;
    for (int i = 0; i < WORD_TAG_PATTERNS.length; i++) {
        cleaned = WORD_TAG_PATTERNS[i].matcher(cleaned).replaceAll(WORD_TAG_RULES[i][1]);
    }
    return cleaned;
}
I found that I can use StringUtils.replace instead of replaceAll, but it only works for literal strings, not for regular expressions.
Thanks!!!
Update:
Following the suggestions in the comments, I tried the code below, but it takes five times longer to process the same String:
/** Markup that is deleted outright (its inner text, if any, is kept). */
private static final Pattern WORD_JUNK = Pattern.compile(
        "align=\"left\""
        + "|<mce:style>"
        + "|<i>"
        + "|<i style=[^>]*>"
        + "|</i>"
        + "|<st1:personname[^>]*>"
        + "|</st1:personname>"
        + "|<st1:metricconverter[^>]*>"
        + "|</st1:metricconverter>"
        + "|<\\W\\Wendif\\W\\W\\W>"
        + "|<![^>]*>"
        + "|<[vowm]:[^>]*>"
        + "|</[vowm]:[^>]*>"
        + "|&(?:amp|lt|gt);"
        + "|<img width[^>]*>"
        + "|<img src=\"file:[^>]*>");

/**
 * Opening span tags carrying Word attributes. Covers style="t, style="T, style="L,
 * style="line, style="m, style="color, style="f, style='m, lang and class="M.
 */
private static final Pattern WORD_SPAN = Pattern.compile(
        "<span (?:style=\"(?:[tTLmf]|line|color)|style='m|lang|class=\"M)[^>]*>");

/** Any br tag, with or without attributes. */
private static final Pattern WORD_BR = Pattern.compile("<br[^>]*>");

/** Upper-case P tag whose style value starts with M. */
private static final Pattern WORD_UPPER_P = Pattern.compile("<P style=\"M[^>]*>");

/** Lower-case p tag whose style value starts with M or T. */
private static final Pattern WORD_LOWER_P = Pattern.compile("<p style=\"[MT][^>]*>");

/** b tag with a style attribute. */
private static final Pattern WORD_BOLD = Pattern.compile("<b style=[^>]*>");

/**
 * Strips Microsoft Word specific markup from an HTML fragment using a small number of
 * precompiled patterns.
 *
 * <p>Matching is case-sensitive and runs on the input as given; the text is never
 * upper-cased. Space removal runs last because the tag patterns above contain spaces.
 *
 * @param source the HTML text to clean; may be {@code null} or empty
 * @return the cleaned text; {@code source} itself when it is {@code null} or empty
 */
public static String cleanWordTags(String source) {
    if (source == null || source.isEmpty()) {
        return source;
    }
    String cleaned = WORD_JUNK.matcher(source).replaceAll("");
    // Keep the span element so the closing </span> tags stay balanced.
    cleaned = WORD_SPAN.matcher(cleaned).replaceAll("<span>");
    cleaned = WORD_BR.matcher(cleaned).replaceAll("<br/>");
    // Rewrite rather than remove. Rules for <p class=...> and <p align=...> were
    // disabled by the original author and remain disabled here.
    cleaned = WORD_UPPER_P.matcher(cleaned).replaceAll("<P>");
    cleaned = WORD_LOWER_P.matcher(cleaned).replaceAll("<p>");
    cleaned = WORD_BOLD.matcher(cleaned).replaceAll("<b>");
    // NOTE(review): this removes every space character from the text. It may originally
    // have targeted a non-breaking-space entity that was lost in transcription - confirm.
    return cleaned.replace(" ", "");
}
In the end, the only solution I found was to replace every "replaceAll" call that does not need a regex with "replace", and to generalize the remaining regular expressions.
Thanks a lot!!!