Search code examples
jrubyfuzzy-searchruby-c-extensionjruby-java-interopmri

Use gems with c-extensions in JRuby, or port them to Java?


I'm doing a fuzzy match test between an input string and some previously entered strings. The test is performed live while typing.

I already have a shockingly accurate algorithm in place called StrikeAMatch, which has been translated into many languages. The fastest Ruby implementation I've found is amatch. However, it is incompatible with my JRuby environment because it crunches data in a C extension that requires the C interpreter for Ruby (MRI). It's pretty fast though:

a = "Lorem ipsum dolor"
b = "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Nullam enim leo, egestas id, condimentum at, laoreet mattis, massa. Sed eleifend nonummy diam. Praesent mauris ante,"
puts Benchmark.measure {
  10000.times { a.pair_distance_similar(b) }
}
# => 0.130000   0.000000   0.130000 (  0.146347)

I hope I can avoid setting up an alternative environment. An alternative approach could be to try and port the original Java code as suggested in the JRuby Wiki. Not sure how to do that though.

Any ideas about how to approach this?


Solution

  • The algorithm is easy to implement. For example, here's a quick implementation I wrote in Java:

    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;
    
    public class StrikeAMatch {
    
        protected int numPairs(final String string) {
            return Math.max(0, string.length() - 1);
        }
    
        protected Map<String, Integer> pairs(final String string) {
            if (string.isEmpty()) {
                return new HashMap<>(0);
            }
    
            final Map<String, Integer> pairs = new HashMap(2 * numPairs(string));
    
            for (int i = 1, j = string.length(); i < j; i += 1) {
                final char a = string.charAt(i - 1);
                final char b = string.charAt(i);
                final String pair = String.format("%c%c", a, b);
                if (pairs.containsKey(pair)) {
                    pairs.put(pair, 1 + pairs.get(pair));
                }
                else {
                    pairs.put(pair, 1);
                }
            }
    
            return pairs;
        }
    
        protected int intersectionSize(
                final Map<String, Integer> lhsPairs,
                final Map<String, Integer> rhsPairs) {
            final Set<String> lhsKeys = lhsPairs.keySet();
            final Set<String> rhsKeys = rhsPairs.keySet();
            final Set<String> pairIntersection = new HashSet<>(lhsKeys);
            pairIntersection.retainAll(rhsKeys);
            int sharedPairs = 0;
            for (final String pair : pairIntersection) {
                sharedPairs += Math.min(lhsPairs.get(pair), rhsPairs.get(pair));
            }
            return sharedPairs;
        }
    
        public double between(final String lhs, final String rhs) {
            if (lhs.isEmpty() && rhs.isEmpty()) {
                return 1.0;
            }
            final Map<String, Integer> lhsPairs = pairs(lhs.toUpperCase());
            final Map<String, Integer> rhsPairs = pairs(rhs.toUpperCase());
            return (2.0 * intersectionSize(lhsPairs, rhsPairs))
                 / (numPairs(lhs) + numPairs(rhs));
        }
    
        public static void main(final String[] args) {
            final StrikeAMatch distance = new StrikeAMatch();
            for (final String lhs : args) {
                for (final String rhs : args) {
                    System.out.printf("d(\"%s\", \"%s\") = [%.7f]%n",
                        lhs, rhs, distance.between(lhs, rhs));
                }
            }
        }
    }
    

    You can even pad the first and last characters, to extend it to one-character or zero-character terms:

    import java.util.HashMap;
    import java.util.Map;
    
    public class PaddedStrikeAMatch extends StrikeAMatch {
    
        protected int numPairs(final String string) {
            return string.length() + 1;
        }
    
        protected Map<String, Integer> pairs(final String string) {
            if (string.isEmpty()) {
                final Map<String, Integer> pairs = new HashMap<>(1);
                pairs.put("  ", 1);
                return pairs;
            }
    
            final Map<String, Integer> pairs = new HashMap(2 * numPairs(string));
            pairs.put(String.format(" %c", string.charAt(0)), 1);
            pairs.put(String.format("%c ", string.charAt(string.length() - 1)), 1);
    
            for (int i = 1, j = string.length(); i < j; i += 1) {
                final char a = string.charAt(i - 1);
                final char b = string.charAt(i);
                final String pair = String.format("%c%c", a, b);
                if (pairs.containsKey(pair)) {
                    pairs.put(pair, 1 + pairs.get(pair));
                }
                else {
                    pairs.put(pair, 1);
                }
            }
    
            return pairs;
        }
    
        public static void main(final String[] args) {
            final StrikeAMatch distance = new PaddedStrikeAMatch();
            for (final String lhs : args) {
                for (final String rhs : args) {
                    System.out.printf("d(\"%s\", \"%s\") = [%.7f]%n",
                        lhs, rhs, distance.between(lhs, rhs));
                }
            }
        }
    }
    

    To validate it, here are the examples proposed in the link you provided:

    % java StrikeAMatch France French
    d("France", "France") = [1.0000000]
    d("France", "French") = [0.4000000]
    d("French", "France") = [0.4000000]
    d("French", "French") = [1.0000000]
    
    % java StrikeAMatch Healed Sealed Healthy Heard Herded Help Sold
    d("Healed", "Healed") = [1.0000000]
    d("Healed", "Sealed") = [0.8000000]
    d("Healed", "Healthy") = [0.5454545]
    d("Healed", "Heard") = [0.4444444]
    d("Healed", "Herded") = [0.4000000]
    d("Healed", "Help") = [0.2500000]
    d("Healed", "Sold") = [0.0000000]
    d("Sealed", "Healed") = [0.8000000]
    d("Sealed", "Sealed") = [1.0000000]
    d("Sealed", "Healthy") = [0.3636364]
    d("Sealed", "Heard") = [0.2222222]
    d("Sealed", "Herded") = [0.2000000]
    d("Sealed", "Help") = [0.0000000]
    d("Sealed", "Sold") = [0.0000000]
    d("Healthy", "Healed") = [0.5454545]
    d("Healthy", "Sealed") = [0.3636364]
    d("Healthy", "Healthy") = [1.0000000]
    d("Healthy", "Heard") = [0.4000000]
    d("Healthy", "Herded") = [0.1818182]
    d("Healthy", "Help") = [0.2222222]
    d("Healthy", "Sold") = [0.0000000]
    d("Heard", "Healed") = [0.4444444]
    d("Heard", "Sealed") = [0.2222222]
    d("Heard", "Healthy") = [0.4000000]
    d("Heard", "Heard") = [1.0000000]
    d("Heard", "Herded") = [0.4444444]
    d("Heard", "Help") = [0.2857143]
    d("Heard", "Sold") = [0.0000000]
    d("Herded", "Healed") = [0.4000000]
    d("Herded", "Sealed") = [0.2000000]
    d("Herded", "Healthy") = [0.1818182]
    d("Herded", "Heard") = [0.4444444]
    d("Herded", "Herded") = [1.0000000]
    d("Herded", "Help") = [0.2500000]
    d("Herded", "Sold") = [0.0000000]
    d("Help", "Healed") = [0.2500000]
    d("Help", "Sealed") = [0.0000000]
    d("Help", "Healthy") = [0.2222222]
    d("Help", "Heard") = [0.2857143]
    d("Help", "Herded") = [0.2500000]
    d("Help", "Help") = [1.0000000]
    d("Help", "Sold") = [0.0000000]
    d("Sold", "Healed") = [0.0000000]
    d("Sold", "Sealed") = [0.0000000]
    d("Sold", "Healthy") = [0.0000000]
    d("Sold", "Heard") = [0.0000000]
    d("Sold", "Herded") = [0.0000000]
    d("Sold", "Help") = [0.0000000]
    d("Sold", "Sold") = [1.0000000]
    

    ... and here's the padded version:

    % java PaddedStrikeAMatch France French
    d("France", "France") = [1.0000000]
    d("France", "French") = [0.4285714]
    d("French", "France") = [0.4285714]
    d("French", "French") = [1.0000000]
    
    % java PaddedStrikeAMatch Healed Sealed Healthy Heard Herded Help Sold
    d("Healed", "Healed") = [1.0000000]
    d("Healed", "Sealed") = [0.7142857]
    d("Healed", "Healthy") = [0.5333333]
    d("Healed", "Heard") = [0.6153846]
    d("Healed", "Herded") = [0.5714286]
    d("Healed", "Help") = [0.3333333]
    d("Healed", "Sold") = [0.1666667]
    d("Sealed", "Healed") = [0.7142857]
    d("Sealed", "Sealed") = [1.0000000]
    d("Sealed", "Healthy") = [0.2666667]
    d("Sealed", "Heard") = [0.3076923]
    d("Sealed", "Herded") = [0.2857143]
    d("Sealed", "Help") = [0.0000000]
    d("Sealed", "Sold") = [0.3333333]
    d("Healthy", "Healed") = [0.5333333]
    d("Healthy", "Sealed") = [0.2666667]
    d("Healthy", "Healthy") = [1.0000000]
    d("Healthy", "Heard") = [0.4285714]
    d("Healthy", "Herded") = [0.2666667]
    d("Healthy", "Help") = [0.3076923]
    d("Healthy", "Sold") = [0.0000000]
    d("Heard", "Healed") = [0.6153846]
    d("Heard", "Sealed") = [0.3076923]
    d("Heard", "Healthy") = [0.4285714]
    d("Heard", "Heard") = [1.0000000]
    d("Heard", "Herded") = [0.6153846]
    d("Heard", "Help") = [0.3636364]
    d("Heard", "Sold") = [0.1818182]
    d("Herded", "Healed") = [0.5714286]
    d("Herded", "Sealed") = [0.2857143]
    d("Herded", "Healthy") = [0.2666667]
    d("Herded", "Heard") = [0.6153846]
    d("Herded", "Herded") = [1.0000000]
    d("Herded", "Help") = [0.3333333]
    d("Herded", "Sold") = [0.1666667]
    d("Help", "Healed") = [0.3333333]
    d("Help", "Sealed") = [0.0000000]
    d("Help", "Healthy") = [0.3076923]
    d("Help", "Heard") = [0.3636364]
    d("Help", "Herded") = [0.3333333]
    d("Help", "Help") = [1.0000000]
    d("Help", "Sold") = [0.0000000]
    d("Sold", "Healed") = [0.1666667]
    d("Sold", "Sealed") = [0.3333333]
    d("Sold", "Healthy") = [0.0000000]
    d("Sold", "Heard") = [0.1818182]
    d("Sold", "Herded") = [0.1666667]
    d("Sold", "Help") = [0.0000000]
    d("Sold", "Sold") = [1.0000000]
    

    If you want to use it directly from JRuby, you need only to add StrikeAMatch.class to your $CLASSPATH, and within your script require 'java' followed by java_import 'StrikeAMatch:

    require 'java'
    
    java_import 'StrikeAMatch'
    
    distance = StrikeAMatch.new
    ARGV.each do |lhs|
      ARGV.each do |rhs|
        puts "d(\"#{lhs}\", \"#{rhs}\") = [#{distance.between(lhs, rhs)}]"
      end
    end
    

    To invoke it:

    % jruby strike_a_match.rb France French
    d("France", "France") = [1.0]
    d("France", "French") = [0.4]
    d("French", "France") = [0.4]
    d("French", "French") = [1.0]