Search code examples
java csv pos-tagger

How to ignore an ASCII Character before parsing?


import java.io.*;
import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

/**
 * Runs a part-of-speech tagger over the entries of one CSV column and prints
 * the tagged text.
 */
public class TagText {
    /**
     * Builds the tagger from {@code taggers/english-left3words-distsim.tagger},
     * obtains the entries of {@code Tt2.csv} through {@code ReadCSV.readColumn},
     * and prints the tagged form of every entry to standard output, one per line.
     *
     * @param args command-line arguments; not used
     * @throws IOException declared by the original signature; which call raises
     *         it is not visible from this class
     * @throws ClassNotFoundException declared by the original signature; which
     *         call raises it is not visible from this class
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        // Initializing the tagger
        MaxentTagger tagger = new MaxentTagger("taggers/english-left3words-distsim.tagger");

        // Assign the reader's result directly; the previous empty ArrayList was
        // allocated and then immediately discarded.
        // NOTE(review): whether column index 4 is zero- or one-based depends on
        // ReadCSV, which is not shown here -- confirm.
        List<String> lines = new ReadCSV().readColumn("Tt2.csv", 4);

        for (String line : lines) {
            String tagged = tagger.tagString(line);
            System.out.println(tagged);
        }
    }
}

I'm trying to parse a CSV file that contains a character (binary 10010111, —) which I want the text parser to ignore. How would I do that?


Solution

  • So I guess you want to remove all special characters?

    I guess it was something like: replaceAll("[^\\w\\s]", ""); (in Java source the backslashes must be doubled inside the string literal)

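    Edit: Full code (note that this version removes only the Unicode replacement character U+FFFD rather than every special character)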

    import java.io.*;
    import java.util.ArrayList;
    import java.util.List;
    import edu.stanford.nlp.tagger.maxent.MaxentTagger;
    
    /**
     * Runs a part-of-speech tagger over the entries of one CSV column and prints
     * the tagged text, dropping the Unicode replacement character from each entry
     * before it is tagged.
     */
    public class TagText {
        /**
         * The Unicode replacement character (U+FFFD) that is stripped from every entry.
         * NOTE(review): presumably this is what the unwanted byte 10010111 turns into
         * when the CSV is decoded -- verify against how ReadCSV reads the file.
         */
        private static final String REPLACEMENT_CHARACTER = "\uFFFD";

        /**
         * Builds the tagger from {@code taggers/english-left3words-distsim.tagger},
         * obtains the entries of {@code Tt2.csv} through {@code ReadCSV.readColumn},
         * removes every occurrence of U+FFFD from each entry, and prints the tagged
         * result to standard output, one per line.
         *
         * @param args command-line arguments; not used
         * @throws IOException declared by the original signature; which call raises
         *         it is not visible from this class
         * @throws ClassNotFoundException declared by the original signature; which
         *         call raises it is not visible from this class
         */
        public static void main(String[] args) throws IOException, ClassNotFoundException {
            // Initializing the tagger
            MaxentTagger tagger = new MaxentTagger("taggers/english-left3words-distsim.tagger");

            // Assign the reader's result directly; the previous empty ArrayList was
            // allocated and then immediately discarded.
            // NOTE(review): whether column index 4 is zero- or one-based depends on
            // ReadCSV, which is not shown here -- confirm.
            List<String> lines = new ReadCSV().readColumn("Tt2.csv", 4);

            for (String line : lines) {
                // String.replace is a literal (non-regex) substitution, so only the
                // exact replacement character is removed; all other text is kept.
                String cleaned = line.replace(REPLACEMENT_CHARACTER, "");
                String tagged = tagger.tagString(cleaned);
                System.out.println(tagged);
            }
        }
    }