I wrote a simple parser in Java that reads a file one character at a time and builds words.
When I ran it under Linux, I noticed that checking for '\n'
doesn't work, although comparing the character with the value 10
works as expected. According to the ASCII table, the value 10 is LF (line feed). I read somewhere (I don't remember where) that Java should be able to detect a newline just by looking for '\n'.
I am using BufferedReader and its read method to read characters.
readLine cannot be used because it would cause other problems.
It looks like the problem appears when I use files with Mac/Windows line endings under Linux.
Here are two ways you can do it:
1. Read the input line by line and split each line with a regular expression to get the individual words.
2. Write your own isDelimiter method and use it to check whether you have reached a split condition.
package misctests;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.util.ArrayList;
import java.util.List;
import org.junit.Test;
/**
 * Demonstrates two ways of splitting text into words when the input mixes
 * different line-ending conventions: a regular-expression split and a
 * character-by-character scan using a custom delimiter check.
 */
public class SplitToWords {

    /** Sample input containing "\r\n", "\n" and "\r" line breaks plus punctuation delimiters. */
    String someWords = "Lorem ipsum\r\n(dolor@sit)amet,\nconsetetur!\rsadipscing'elitr;sed~diam";

    /** Regular expression matching one or more whitespace or punctuation delimiter characters. */
    String delimsRegEx = "[\\s;,\\(\\)!'@~]+";

    /** The punctuation delimiters only; whitespace is detected separately in {@link #isDelimiter(int)}. */
    String delimsPlain = ";,()!'@~"; // without whitespaces

    /** The words both splitting strategies are expected to produce from {@link #someWords}. */
    String[] expectedWords = {
        "Lorem",
        "ipsum",
        "dolor",
        "sit",
        "amet",
        "consetetur",
        "sadipscing",
        "elitr",
        "sed",
        "diam"
    };

    /**
     * Minimal character source over a string, standing in for a reader that
     * delivers one character per {@link #read()} call.
     */
    private static final class StringReader {

        /** Value returned by {@link #read()} once the input is exhausted. */
        static final int EOF = -1;

        private final String input;
        private final int len;
        private int pos = 0;

        /**
         * @param input the text to read; {@code null} is treated as the empty string
         */
        StringReader(String input) {
            this.input = input == null ? "" : input;
            this.len = this.input.length();
        }

        /**
         * @return {@code true} while at least one unread character remains
         */
        public boolean hasMoreChars() {
            return pos < len;
        }

        /**
         * Reads the next character.
         *
         * @return the character as an {@code int}, or {@link #EOF} when no
         *         characters remain (so a real NUL character stays distinguishable
         *         from end of input)
         */
        public int read() {
            if (!hasMoreChars()) {
                return EOF;
            }
            return input.charAt(pos++);
        }
    }

    /** Splits the sample text with a single regular-expression split. */
    @Test
    public void splitToWords_1() {
        String[] actual = someWords.split(delimsRegEx);
        assertEqualsWords(expectedWords, actual);
    }

    /** Splits the sample text by scanning it one character at a time. */
    @Test
    public void splitToWords_2() {
        List<String> words = splitByDelimiters(someWords);
        String[] actual = words.toArray(new String[0]);
        assertEqualsWords(expectedWords, actual);
    }

    /**
     * Scans {@code text} character by character and collects the runs of
     * non-delimiter characters.
     *
     * @param text the text to split; {@code null} yields an empty list
     * @return the words in order of appearance; never contains empty strings,
     *         even when the text starts or ends with delimiters
     */
    private List<String> splitByDelimiters(String text) {
        StringReader reader = new StringReader(text);
        List<String> words = new ArrayList<String>();
        StringBuilder current = new StringBuilder();
        int c;
        while ((c = reader.read()) != StringReader.EOF) {
            if (isDelimiter(c)) {
                flushWord(current, words);
            } else {
                current.append((char) c);
            }
        }
        // The last word is not followed by a delimiter, so flush it explicitly.
        flushWord(current, words);
        return words;
    }

    /**
     * Moves the pending word, if any, from {@code current} into {@code words}
     * and resets the buffer. Does nothing when the buffer is empty, which is
     * what keeps consecutive delimiters from producing empty words.
     */
    private void flushWord(StringBuilder current, List<String> words) {
        if (current.length() > 0) {
            words.add(current.toString());
            current.setLength(0);
        }
    }

    /**
     * @param c the character to test
     * @return {@code true} if {@code c} is whitespace or one of the characters
     *         in {@link #delimsPlain}
     */
    private boolean isDelimiter(int c) {
        return Character.isWhitespace(c) || delimsPlain.indexOf(c) >= 0;
    }

    /**
     * Asserts that both arrays are non-null, have the same length and contain
     * equal words at every index.
     */
    private void assertEqualsWords(String[] expected, String[] actual) {
        assertNotNull(expected);
        assertNotNull(actual);
        assertEquals("word count", expected.length, actual.length);
        for (int i = 0; i < expected.length; i++) {
            assertEquals("word at index " + i, expected[i], actual[i]);
        }
    }
}