Tags: java, file-io, stanford-nlp

Java - processing the content of each file within a directory


I am using Stanford CoreNLP to carry out sentiment analysis on 25,000 individual movie reviews, all contained in a single directory. To do this I need to slightly alter the Stanford example code, as it only analyses the sentences of a single text file.

My attempt at carrying this out is as follows:

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.io.FileUtils;

import com.google.common.io.Files;

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;

/** A simple corenlp example ripped directly from the Stanford CoreNLP website using text from wikinews. */
public class sentimentMain {

  public static void main(String[] args) throws IOException {
    // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution 
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // read some text from the file..
    Iterator it = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\neg"), null, false);
    Iterator it1 = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\pos"), null, false);
    Iterator it2 = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\unsup"), null, false);

    File inputFile  = new File ((String) (it.next()));
    String text = Files.toString(inputFile, Charset.forName("UTF-8"));
    System.out.println(text);

    //File inputFile = new File("C:/stanford-corenlp-full-2016-10-31/input.txt");
    //String text = Files.toString(inputFile, Charset.forName("UTF-8"));

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    // run all Annotators on this text
    pipeline.annotate(document);

    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

    for(CoreMap sentence: sentences) {
      // traversing the words in the current sentence
      // a CoreLabel is a CoreMap with additional token-specific methods
      for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
        // this is the text of the token
        String word = token.get(TextAnnotation.class);
        // this is the POS tag of the token
        String pos = token.get(PartOfSpeechAnnotation.class);
        // this is the NER label of the token
        String ne = token.get(NamedEntityTagAnnotation.class);

        System.out.println("word: " + word + " pos: " + pos + " ne:" + ne);
      }

      // this is the parse tree of the current sentence
      Tree tree = sentence.get(TreeAnnotation.class);
      System.out.println("parse tree:\n" + tree);

      // this is the Stanford dependency graph of the current sentence
      SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
      System.out.println("dependency graph:\n" + dependencies);
    }

    // This is the coreference link graph
    // Each chain stores a set of mentions that link to each other,
    // along with a method for getting the most representative mention
    // Both sentence and token offsets start at 1!
    Map<Integer, CorefChain> graph = 
        document.get(CorefChainAnnotation.class);

  }

}

Running this, I receive the following error:

Exception in thread "main" java.lang.ClassCastException: java.io.File cannot be cast to java.lang.String
    at sentimentMain.main(sentimentMain.java:46)

I understand that "it.next()" cannot be cast to a String, but does anyone know another way I can ensure the contents of each file are read in as a String for processing?

Thanks in advance :)


Solution

  • The reported error is a runtime ClassCastException, not a compilation problem: FileUtils.iterateFiles returns an Iterator<File>, so it.next() already yields a File and cannot be cast to String. Declaring the iterators as Iterator<File> removes the need for any cast. Separately, the per-file work (reading the text, creating the Annotation, and running the pipeline) should live inside a while (it.hasNext()) loop, so that every review in the directory is processed rather than just the first.
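
    The core of the fix, shown as a minimal sketch (paths and helpers exactly as in the question; only the neg directory is shown):

        Iterator<File> it = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\neg"), null, false);

        while (it.hasNext()) {
          // it.next() already returns a File, so no cast is needed
          File inputFile = it.next();
          String text = Files.toString(inputFile, Charset.forName("UTF-8"));
          // ... build the Annotation and run the pipeline per file ...
        }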

    Please find below the full edited code, with the cast removed and the processing moved inside the loop.

    import java.io.File;
    import java.io.IOException;
    import java.nio.charset.Charset;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;
    import java.util.Properties;
    
    import org.apache.commons.io.FileUtils;
    
    import com.google.common.io.Files;
    
    import edu.stanford.nlp.dcoref.CorefChain;
    import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
    import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
    import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
    import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
    import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
    import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
    import edu.stanford.nlp.ling.CoreLabel;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.StanfordCoreNLP;
    import edu.stanford.nlp.semgraph.SemanticGraph;
    import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
    import edu.stanford.nlp.trees.Tree;
    import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
    import edu.stanford.nlp.util.CoreMap;
    
    /** A simple corenlp example ripped directly from the Stanford CoreNLP website using text from wikinews. */
    public class sentimentMain {
    
      public static void main(String[] args) throws IOException {
        // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution 
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    
        // iterate over every review file in each directory
        Iterator<File> it = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\neg"), null, false);
        Iterator<File> it1 = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\pos"), null, false);
        Iterator<File> it2 = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\unsup"), null, false);
        // it1 (pos) and it2 (unsup) can be drained with the same loop used for it below
    
        while (it.hasNext()) {
          // it.next() already returns a File, so no cast is needed
          File inputFile = it.next();
          String text = Files.toString(inputFile, Charset.forName("UTF-8"));
          System.out.println(text);

          // create an empty Annotation just with the given text
          Annotation document = new Annotation(text);

          // run all Annotators on this text
          pipeline.annotate(document);

          // these are all the sentences in this document
          // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
          List<CoreMap> sentences = document.get(SentencesAnnotation.class);

          for (CoreMap sentence : sentences) {
            // traversing the words in the current sentence
            // a CoreLabel is a CoreMap with additional token-specific methods
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
              // this is the text of the token
              String word = token.get(TextAnnotation.class);
              // this is the POS tag of the token
              String pos = token.get(PartOfSpeechAnnotation.class);
              // this is the NER label of the token
              String ne = token.get(NamedEntityTagAnnotation.class);

              System.out.println("word: " + word + " pos: " + pos + " ne: " + ne);
            }

            // this is the parse tree of the current sentence
            Tree tree = sentence.get(TreeAnnotation.class);
            System.out.println("parse tree:\n" + tree);

            // this is the Stanford dependency graph of the current sentence
            SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
            System.out.println("dependency graph:\n" + dependencies);
          }

          // This is the coreference link graph
          // Each chain stores a set of mentions that link to each other,
          // along with a method for getting the most representative mention
          // Both sentence and token offsets start at 1!
          Map<Integer, CorefChain> graph =
              document.get(CorefChainAnnotation.class);
        }
      }
    
    }
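
    Note that the pipeline above tags, parses, and resolves coreference, but it never actually computes a sentiment score, which is the stated goal. One way to get a per-sentence sentiment label (a sketch, assuming the standard sentiment model that ships with the CoreNLP distribution is on the classpath) is to add the sentiment annotator and read SentimentCoreAnnotations.SentimentClass:

        import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;

        // the sentiment annotator needs the parser, but not ner or dcoref
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, parse, sentiment");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // per file, as in the loop above
        Annotation document = new Annotation(text);
        pipeline.annotate(document);

        for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
          // e.g. "Negative", "Neutral", "Positive"
          String sentiment = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
          System.out.println(sentiment + "\t" + sentence);
        }

    Dropping the ner and dcoref annotators should also speed the pipeline up considerably, which matters when you are processing 25,000 reviews.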