Search code examples
java bufferedreader eol

Line endings confusion


I made a simple parser in Java that reads a file one character at a time and constructs words.

I tried to run it under Linux and I noticed that looking for '\n' doesn't work. Although if I compare the character with the value 10 it works as expected. According to the ASCII table value 10 is LF (line feed). I read somewhere (I don't remember where) that Java should be able to find a newline only by looking for '\n'.

I am using BufferedReader and the read method to read characters.

EDIT

readLine cannot be used because it will produce other problems

It looks like the problem appears when I am using files with Mac/Windows line endings under Linux.


Solution

  • Here are two ways you can do it:

    1- Read the input line by line and split each line using a regular expression to get the individual words.

    2- Write your own isDelimiter method and use it to check whether you have reached a split condition or not.

    package misctests;
    
    import static org.junit.Assert.assertEquals;
    import static org.junit.Assert.assertNotNull;
    import java.util.ArrayList;
    import java.util.List;
    import org.junit.Test;
    
    
    /**
     * Demonstrates two ways of splitting text into words regardless of which
     * line-ending convention ({@code \n}, {@code \r\n} or {@code \r}) the text uses:
     * a regular-expression split and a hand-written character-by-character tokeniser.
     */
    public class SplitToWords {
    
        /** Sample text mixing all three line-ending styles with punctuation delimiters. */
        String someWords = "Lorem ipsum\r\n(dolor@sit)amet,\nconsetetur!\rsadipscing'elitr;sed~diam";
        /** Regular expression matching one or more consecutive delimiters (whitespace included). */
        String delimsRegEx = "[\\s;,\\(\\)!'@~]+";
        /** Non-whitespace delimiter characters; whitespace is detected separately. */
        String delimsPlain = ";,()!'@~"; // without whitespaces
    
        /** The words both approaches are expected to extract from {@link #someWords}. */
        String[] expectedWords = {
            "Lorem",
            "ipsum",
            "dolor",
            "sit",
            "amet",
            "consetetur",
            "sadipscing",
            "elitr",
            "sed",
            "diam"
        };
    
        /**
         * Minimal in-memory character source that mimics reading a stream one
         * character at a time.
         */
        private static final class StringReader {
            private final String input;
            private int pos = 0;
    
            /**
             * @param input the text to read; {@code null} is treated as an empty string
             */
            StringReader(String input) {
                this.input = input == null ? "" : input;
            }
    
            /** @return {@code true} while at least one unread character remains */
            public boolean hasMoreChars() {
                return pos < input.length();
            }
    
            /**
             * Reads the next character.
             *
             * @return the character as an {@code int}, or {@code -1} once the input is
             *         exhausted (so that a genuine NUL character stays distinguishable
             *         from end of input)
             */
            public int read() {
                return hasMoreChars() ? input.charAt(pos++) : -1;
            }
        }
    
        /** Splits the sample text with a single regular-expression split. */
        @Test
        public void splitToWords_1() {
            // Note: String.split would yield a leading empty string if the text
            // started with a delimiter; the sample text does not.
            String[] actual = someWords.split(delimsRegEx);
            assertEqualsWords(expectedWords, actual);
        }
    
        /** Splits the sample text by reading it one character at a time. */
        @Test
        public void splitToWords_2() {
            List<String> words = splitWithReader(someWords);
            String[] actual = words.toArray(new String[words.size()]);
    
            assertEqualsWords(expectedWords, actual);
        }
    
        /**
         * Tokenises {@code text} character by character, treating every run of
         * delimiters as a single separator.
         *
         * @param text the text to split; {@code null} yields an empty list
         * @return the words in order of appearance; never contains empty strings,
         *         even when the text starts or ends with delimiters
         */
        private List<String> splitWithReader(String text) {
            StringReader sr = new StringReader(text);
            List<String> words = new ArrayList<String>();
            StringBuilder current = new StringBuilder();
            int c;
            while ((c = sr.read()) != -1) {
                if (isDelimiter(c)) {
                    flushWord(current, words);
                } else {
                    current.append((char) c);
                }
            }
            // The last word is not followed by a delimiter, so flush it explicitly.
            flushWord(current, words);
            return words;
        }
    
        /** Moves the word collected in {@code current} (if any) to {@code words} and resets the buffer. */
        private void flushWord(StringBuilder current, List<String> words) {
            if (current.length() > 0) {
                words.add(current.toString());
                current.setLength(0);
            }
        }
    
        /**
         * @param c the character to test
         * @return {@code true} if {@code c} is whitespace (which covers {@code \r} and
         *         {@code \n}) or one of the characters in {@link #delimsPlain}
         */
        private boolean isDelimiter(int c) {
            return Character.isWhitespace(c) || delimsPlain.indexOf(c) >= 0;
        }
    
        /**
         * Asserts that both arrays are non-null, have the same length and contain
         * equal words at every index.
         */
        private void assertEqualsWords(String[] expected, String[] actual) {
            assertNotNull(expected);
            assertNotNull(actual);
            assertEquals("word count", expected.length, actual.length);
            for (int i = 0; i < expected.length; i++) {
                assertEquals("word at index " + i, expected[i], actual[i]);
            }
        }
    }