Search code examples
javajava-8java-stream

Optimizing parsing a semicolon-delimited HTML header value with Java streams


I was looking through some code and came across this method that takes an HTML Header value (i.e. Content-Disposition=inline;filename=foo.bar) and parses it into a map separated by the semi-colon's into key=value pairs. At first it looked like a good candidate for optimization using a stream, but after I implemented it, the fact that I can't reuse the computed String.indexOf('=') value means the string must be scanned 3 times, which is actually less optimal than the original.

I'm perfectly aware that there are many instances where streams aren't the right tool for the job, but I was wondering if I had just missed some technique that could allow the Stream to be as performant/more performant than the initial code.

/**
 * Convert a Header Value String into a Map
 *
 * @param value The Header Value
 * @return The data Map
 */
private static Map<String,String> headerMap (String value) {
  int eq;
  Map<String,String> map = new HashMap<>();
  for(String entry : value.split(";")) {
    if((eq = entry.indexOf('=')) != -1) {
      map.put(entry.substring(0,eq),entry.substring(eq + 1));
    }
  }
  return map;

  return Stream.of(value.split(";")).filter(entry -> entry.indexOf('=') != -1).collect(Collectors.));
} //headerMap

My attempt at streaming it:

/**
 * Convert a Header Value String into a Map
 *
 * @param value The Header Value
 * @return The data Map
 */
private static Map<String,String> headerMap (String value) {
  return Stream.of(value.split(";")).filter(entry -> entry.indexOf('=') != -1).collect(Collectors.toMap(entry -> entry.substring(0,entry.indexOf('=')),entry -> entry.substring(entry.substring(entry.indexOf('=') + 1))));
} //headerMap

Solution

  • I came up with the following code:

    private static Map<String, String> headerMap(String value) {
        return Stream.of(value.split(";"))
                    .filter(entry -> entry.indexOf('=') != -1)
                    .map(entry -> {
                        int i = entry.indexOf('=');
                        return new String[] { entry.substring(0, i), entry.substring(i + 1) };
                    })
                    .collect(Collectors.toMap(array -> array[0], array -> array[1]));
    }
    

    It only scans for the entry two times, by storing the key and value inside an array of size 2. I'm not sure it will be as performant as the for loop since we are creating another Object to serve just as a holder.

    Another solution that scans the entry only one time is this, although I'm not very found of it:

    private static Map<String, String> headerMap(String value) {
        return Stream.of(value.split(";"))
                .map(entry -> {
                    int i = entry.indexOf('=');
                    if (i == -1) {
                        return null;
                    }
                    return new String[] { entry.substring(0, i), entry.substring(i + 1) };
                })
                .filter(Objects::nonNull)
                .collect(Collectors.toMap(array -> array[0], array -> array[1]));
    }
    

    I realized a JMH benchmark to test this. Following is the benchmark code:

    @Warmup(iterations = 5, time = 1000, timeUnit = TimeUnit.MILLISECONDS)
    @Measurement(iterations = 10, time = 1000, timeUnit = TimeUnit.MILLISECONDS)
    @BenchmarkMode(Mode.AverageTime)
    @OutputTimeUnit(TimeUnit.MICROSECONDS)
    @Fork(3)
    @State(Scope.Benchmark)
    public class StreamTest {
    
        private static final String VALUE = "Accept=text/plain;"
            + "Accept-Charset=utf-8;"
            + "Accept-Encoding=gzip, deflate;"
            + "Accept-Language=en-US;"
            + "Accept-Datetime=Thu, 31 May 2007 20:35:00 GMT;"
            + "Cache-Control=no-cache;"
            + "Connection=keep-alive;"
            + "Content-Length=348;"
            + "Content-Type=application/x-www-form-urlencoded;"
            + "Date=Tue, 15 Nov 1994 08:12:31 GMT;"
            + "Expect=100-continue;"
            + "Max-Forwards=10;"
            + "Pragma=no-cache";
    
        @Benchmark
        public void loop() {
            int eq;
            Map<String, String> map = new HashMap<>();
            for (String entry : VALUE.split(";")) {
                if ((eq = entry.indexOf('=')) != -1) {
                    map.put(entry.substring(0, eq), entry.substring(eq + 1));
                }
            }
        }
    
        @Benchmark
        public void stream1() {
            Stream.of(VALUE.split(";"))
            .filter(entry -> entry.indexOf('=') != -1)
            .map(entry -> {
                int i = entry.indexOf('=');
                return new String[] { entry.substring(0, i), entry.substring(i + 1) };
            })
            .collect(Collectors.toMap(array -> array[0], array -> array[1]));
        }
    
        @Benchmark
        public void stream2() {
            Stream.of(VALUE.split(";"))
            .map(entry -> {
                int i = entry.indexOf('=');
                if (i == -1) {
                    return null;
                }
                return new String[] { entry.substring(0, i), entry.substring(i + 1) };
            })
            .filter(Objects::nonNull)
            .collect(Collectors.toMap(array -> array[0], array -> array[1]));
        }
    
        public static void main(String[] args) throws Exception {
             Main.main(args);
        }
    
    }
    

    and this is the result (Code i5 3230M CPU @ 2.60 GHz, Windows 10, Oracle JDK 1.8.0_25):

    Benchmark           Mode  Cnt  Score   Error  Units
    StreamTest.loop     avgt   30  1,541 ± 0,038  us/op
    StreamTest.stream1  avgt   30  1,633 ± 0,042  us/op
    StreamTest.stream2  avgt   30  1,604 ± 0,058  us/op
    

    What this demonstrates is that both the streams solution and the for loop are actually equivalent in terms of performance.