Search code examples
javasplitbacktrackingtrie

split strings with backtracking


I'm trying to write a code that split a spaceless string into meaningful words but when I give sentence like "arealways" it returns ['a', 'real', 'ways'] and what I want is ['are', 'always'] and my dictionary contains all this words. How can I can write a code that keep backtracking till find the best matching?

the code that returns 'a', 'real', 'ways':

splitter.java:

public class splitter {

    HashMap<String, String> map = new HashMap<>();
    Trie dict;

    public splitter(Trie t) {
        dict = t;
    }

    public String split(String test) {
        if (dict.contains(test)) {
            return (test);
        } else if (map.containsKey(test)) {
            return (map.get(test));
        } else {
            for (int i = 0; i < test.length(); i++) {
                String pre = test.substring(0, i);
                if (dict.contains(pre)) {
                    String end = test.substring(i);
                    String fixedEnd = split(end);
                        if(fixedEnd != null){
                            map.put(test, pre + " " + fixedEnd);
                            return pre + " " + fixedEnd;
                        }else {
                        }
                    
                }
            }

        }
        map.put(test,null);
        return null;
    }
} 

Trie.java:

public class Trie {
    public static class TrieNode {
        private HashMap<Character, TrieNode> charMap = new HashMap<>();
        public char c;
        public boolean endOWord;
        public void insert(String s){
        }
        public boolean contains(String s){
            return true;
        }
    }
    public TrieNode root;
    
    public Trie() {
        root = new TrieNode();
    }
    
    public void insert(String s){
        TrieNode p = root;
        for(char c : s.toCharArray()) {
            if(! p.charMap.containsKey(c)) {
                TrieNode node = new TrieNode();
                node.c = c;
                p.charMap.put(c, node);
            }
            p = p.charMap.get(c);
        }
        p.endOWord = true;
    }
    public boolean contains(String s){
        TrieNode p = root;
        for(char c : s.toCharArray()) {
            if(!p.charMap.containsKey(c)) {
                return false;
            }
            p = p.charMap.get(c);
        }
        return p.endOWord;
    }
    public void insertDictionary(String filename) throws FileNotFoundException{
        File file = new File(filename);
        Scanner sc = new Scanner(file);
        while(sc.hasNextLine())
            insert(sc.nextLine());
    }
    

    public void insertDictionary(File file) throws FileNotFoundException{
        Scanner sc = new Scanner(file);
        while(sc.hasNextLine())
            insert(sc.nextLine());
    }
} 

WordSplitter class:

public class WordSplitter {

    public static void main(String[] args) throws FileNotFoundException {
            
           String test = "arealways";
           String myFile = "/Users/abc/Desktop/dictionary.txt";
           Trie dict = new Trie();
           dict.insertDictionary(myFile);
          splitter sp = new splitter(dict);
          test = sp.split(test);
          
          if(test != null)
          System.out.println(test);
          else
          System.out.println("No Splitting Found.");            
           
    }

        } 

Solution

  • Using the OP's split method and the implementation of Trie found in The Trie Data Structure in Java Baeldung's article, I was able to get the following results:

    realways=real ways
    arealways=a real ways
    

    However, if I remove the word "real" or "a" from the dictionary, I get the following results:

    realways=null
    arealways=are always
    

    Here's the entire code I used to get these results:

    public class Splitter {
    
        private static Map<String, String> map = new HashMap<>();
        private Trie dict;
        
        public Splitter(Trie t) {
            dict = t;
        }
    
        /**
         * @param args
         */
        public static void main(String[] args) {
            List<String> words = List.of("a", "always", "are", "area", "r", "way", "ways"); // The order of these words does not seem to impact the final result
            String test = "arealways";
    
            Trie t = new Trie();
            for (String word : words) {
                t.insert(word);
            }
            System.out.println(t);
            Splitter splitter = new Splitter(t);
            splitter.split(test);
            map.entrySet().forEach(System.out::println);
        }
    
        public String split(String test) {
            if (dict.find(test)) {
                return (test);
            } else if (map.containsKey(test)) {
                return (map.get(test));
            } else {
                for (int i = 0; i < test.length(); i++) {
                    String pre = test.substring(0, i);
                    if (dict.find(pre)) {
                        String end = test.substring(i);
                        String fixedEnd = split(end);
                        if (fixedEnd != null) {
                            map.put(test, pre + " " + fixedEnd);
                            return pre + " " + fixedEnd;
                        } else {
                        }
    
                    }
                }
    
            }
            map.put(test, null);
            return null;
        }
    
        public static class Trie {
            private TrieNode root = new TrieNode();
    
            public boolean find(String word) {
                TrieNode current = root;
                for (int i = 0; i < word.length(); i++) {
                    char ch = word.charAt(i);
                    TrieNode node = current.getChildren().get(ch);
                    if (node == null) {
                        return false;
                    }
                    current = node;
                }
                return current.isEndOfWord();
            }
    
            public void insert(String word) {
                TrieNode current = root;
                for (char l : word.toCharArray()) {
                    current = current.getChildren().computeIfAbsent(l, c -> new TrieNode());
                }
                current.setEndOfWord(true);
            }
            
            @Override
            public String toString() {
                return toString(root);
            }
    
            /**
             * @param root2
             * @return
             */
            private String toString(TrieNode node) {
                return node.toString();
            }
    
            public static class TrieNode {
                private Map<Character, TrieNode> children = new HashMap<>() ;
                private String contents;
                private boolean endOfWord;
    
                public Map<Character, TrieNode> getChildren() {
                    return children;
                }
    
                public void setEndOfWord(boolean endOfWord) {
                    this.endOfWord = endOfWord;
                }
    
                public boolean isEndOfWord() {
                    return endOfWord;
                }
                
                @Override
                public String toString() {
                    StringBuilder sbuff = new StringBuilder();
                    if (isLeaf()) {
                        return sbuff.toString();
                    }
                    children.entrySet().forEach(entry -> {
                        sbuff.append(entry.getKey() + "\n");
                    });
                    sbuff.append(" ");
                    return children.toString();
                }
                
                private boolean isLeaf() {
                    return children.isEmpty();
                }
            }
            
            public void delete(String word) {
                delete(root, word, 0);
            }
    
            private boolean delete(TrieNode current, String word, int index) {
                if (index == word.length()) {
                    if (!current.isEndOfWord()) {
                        return false;
                    }
                    current.setEndOfWord(false);
                    return current.getChildren().isEmpty();
                }
                char ch = word.charAt(index);
                TrieNode node = current.getChildren().get(ch);
                if (node == null) {
                    return false;
                }
                
                boolean shouldDeleteCurrentNode = delete(node, word, index + 1) && !node.isEndOfWord();
    
                if (shouldDeleteCurrentNode) {
                    current.getChildren().remove(ch);
                    return current.getChildren().isEmpty();
                }
                return false;
            }
        }
    }
    

    I improved the original code by adding a toString() method to the Trie and TrieNode. Now, when I print out the Trie object "t", I get the following result:

    {a={r={e={a=}}, l={w={a={y={s=}}}}}, w={a={y={s=}}}}
    

    My conclusion is that the OP's TrieNode implementation is incorrect. The way the Trie is built, given the inputted string value, the behavior described by the OP seems to be correct.