The text in the main method takes more than 2 seconds to return the NER results. I am not an expert in NLP, and this code is not at all scalable. I have added comments in the two places where I have identified bottlenecks. Can you please suggest improvements to the performance of the program?
Thanks.
public class NERSentimentUtil
{
private static final Logger logger = Logger.getLogger(NERSentimentUtil.class);
private static final String serializedClassifier7 = "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz";
private static final String serializedClassifier4 = "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz";
private static final String serializedClassifier3 = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
private static NERClassifierCombiner ncc;
private static StanfordCoreNLP pipeline;
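//load the CRF models and the CoreNLP pipeline once, at class-load time, so the expensive model deserialization is not repeated per document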
static
{
try
{
ncc = new NERClassifierCombiner(serializedClassifier3, serializedClassifier4, serializedClassifier7);
} catch (IOException e) {
logger.error("Failed to load the NER classifiers", e);
}
}
static
{
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, parse, sentiment, sutime");
/*props.setProperty("ner.useSUTime", "0");*/
String defs_sutime = "/edu/stanford/nlp/models/sutime/defs.sutime.txt";
String holiday_sutime = "/edu/stanford/nlp/models/sutime/english.holidays.sutime.txt";
String _sutime = "/edu/stanford/nlp/models/sutime/english.sutime.txt";
String sutimeRules = defs_sutime + "," + holiday_sutime + "," + _sutime;
props.setProperty("ner.useSUTime", "true");
props.setProperty("-sutime.rules", sutimeRules);
props.setProperty("sutime.binders", "0");
props.setProperty("sutime.markTimeRanges", "false");
props.setProperty("sutime.includeRange", "false");
props.setProperty("customAnnotatorClass.sutime", "edu.stanford.nlp.time.TimeAnnotator");
props.setProperty("parse.maxlen", "20");
//props.setProperty("ner.applyNumericClassifiers", "false");
//props.setProperty("nthreads", "16");
//props.setProperty("threads", "16");
//props.setProperty("parse.nthreads","16");
//props.setProperty("ssplit.eolonly","true");
props.setProperty("-parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz");
RedwoodConfiguration.current().clear().apply();
pipeline = new StanfordCoreNLP(props);
//RedwoodConfiguration.empty().capture(System.err).apply();
}
//A sentiment score of 0 or 1 is negative, 2 neutral and 3 or 4 positive.
private static int getScore(int score)
{
if(score<2)
return -1;
else if(score==2)
return 0;
else
return 1;
}
public static HashMap<String, Object> getStanford(String s, long dateString) //epoch seconds; FormatUtil.getDate(dateString) turns it into a "2013-07-14"-style doc date
{
int finalScore = 0;
HashMap<String, Object> map = new HashMap<>();
HashMap<String, Integer> dateMap = new HashMap<>();
HashMap<String, Integer> dateCountMap = new HashMap<>();
HashMap<String, String> dateSentenceMap = new HashMap<>();
HashMap<String, Integer> personMap = new HashMap<>();
HashMap<String, Integer> personCountMap = new HashMap<>();
HashMap<String, Integer> orgMap = new HashMap<>();
HashMap<String, Integer> orgCountMap = new HashMap<>();
HashMap<String, Integer> locationMap = new HashMap<>();
HashMap<String, Integer> locationCountMap = new HashMap<>();
HashMap<String, Article_Location> locationArticleMap = new HashMap<>();
ArrayList<Articel_Ner> organisationlist = new ArrayList<>();
ArrayList<Articel_Ner> personlist = new ArrayList<>();
ArrayList<Artilcle_Ner_Date> datelist = new ArrayList<>();
ArrayList<Article_NerLocation> locationList = new ArrayList<>();
try
{
Annotation annotation = pipeline.process(s); //1/3rd of the time is taken up by this line
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence : sentences)
{
String str = sentence.toString();
int score = getSentiment(sentence);
finalScore+=score;
boolean dFlag = true;
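//the combiner runs all three serialized CRF models over this sentence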
List<Triple<String,Integer,Integer>> triples = ncc.classifyToCharacterOffsets(str);
for (Triple<String,Integer,Integer> trip : triples)
{
String ne = trip.first();
String word = str.substring(trip.second(), trip.third()).toLowerCase();
switch(ne)
{
case "LOCATION":
extractLocation(locationMap, locationCountMap, locationArticleMap, score, word);
break;
case "ORGANIZATION":
extractOrg(orgMap, orgCountMap, score, word);
break;
case "PERSON":
extractPerson(personMap, personCountMap, score, word);
break;
case "DATE":
if(dFlag)
{
extractSUDate(dateString, dateMap, dateCountMap, dateSentenceMap, str, score);
dFlag = false;
}
break;
default:
break;
}
}
}
//2/3rd of the time is taken by these 4 methods: can be optimized
mapDate(dateMap, dateCountMap, dateSentenceMap, datelist);
mapLocation(locationMap, locationCountMap, locationArticleMap, locationList);
mapOrg(orgMap, orgCountMap, organisationlist);
mapPerson(personMap, personCountMap, personlist);
//
}
catch(Exception e)
{
logger.error("NER/sentiment extraction failed for input: " + s, e);
}
finalScore = Integer.signum(finalScore); //collapse the summed sentence scores to -1, 0 or +1
map.put("ORGANISATION", organisationlist);
map.put("PERSON", personlist);
map.put("DATE", datelist);
map.put("LOCATION", locationList);
map.put("SENTIMENT", finalScore);
return map;
}
private static void extractPerson(HashMap<String, Integer> personMap, HashMap<String, Integer> personCountMap,
int score, String word)
{
if(personMap.get(word)!=null)
{
personMap.put(word, personMap.get(word)+score);
personCountMap.put(word, personCountMap.get(word)+1);
}
else
{
personMap.put(word, score);
personCountMap.put(word, 1);
//personSentenceMap.put(pname, str);
}
}
private static void extractOrg(HashMap<String, Integer> orgMap, HashMap<String, Integer> orgCountMap,
int score, String word)
{
if(orgMap.get(word)!=null)
{
orgMap.put(word, orgMap.get(word)+score);
orgCountMap.put(word, orgCountMap.get(word)+1);
}
else
{
orgMap.put(word, score);
orgCountMap.put(word, 1);
//orgSentenceMap.put(oname, str);
}
}
private static void extractLocation(HashMap<String, Integer> locationMap,
HashMap<String, Integer> locationCountMap,
HashMap<String, Article_Location> locationArticleMap,
int score,
String word)
{
if(locationMap.get(word)!=null)
{
locationMap.put(word, locationMap.get(word)+score);
locationCountMap.put(word, locationCountMap.get(word)+1);
}
else
{
Article_Location articleLocation = LocationUtil.getLocation(word);
locationMap.put(word, score);
locationCountMap.put(word, 1);
locationArticleMap.put(word, articleLocation);
}
}
private static void extractSUDate(long dateString,
HashMap<String, Integer> dateMap,
HashMap<String, Integer> dateCountMap,
HashMap<String, String> dateSentenceMap,
String str,
int score) {
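//NOTE: pipeline.annotate below re-runs every annotator (including parse and sentiment)
//on this sentence a second time just to resolve dates - a big share of the slowness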
Annotation dateAnnotation = new Annotation(str);
dateAnnotation.set(CoreAnnotations.DocDateAnnotation.class, FormatUtil.getDate(dateString));
pipeline.annotate(dateAnnotation);
for(CoreMap timex:dateAnnotation.get(TimeAnnotations.TimexAnnotations.class))
{
TimeExpression timeExpression = timex.get(TimeExpression.Annotation.class);
if(timeExpression!=null && timeExpression.getTemporal()!=null &&
timeExpression.getTemporal().getTimexValue()!=null)
{
String word = checkDate(timeExpression.getTemporal().getTimexValue());
if(word!=null)
{
if(dateMap.get(word)!=null)
{
dateMap.put(word, dateMap.get(word)+score);
dateCountMap.put(word, dateCountMap.get(word)+1);
dateSentenceMap.put(word, dateSentenceMap.get(word)+" "+str);
}
else
{
dateMap.put(word, score);
dateCountMap.put(word, 1);
dateSentenceMap.put(word, str);
}
}
}
}
}
private static int getSentiment(CoreMap sentence) {
Tree annotatedTree = sentence.get(SentimentAnnotatedTree.class);
int localScore = RNNCoreAnnotations.getPredictedClass(annotatedTree);
int score = getScore(localScore);
return score;
}
private static void mapLocation(HashMap<String, Integer> locationMap,
HashMap<String, Integer> locationCountMap,
HashMap<String, Article_Location> locationArticleMap,
ArrayList<Article_NerLocation> locationList)
{
for(Map.Entry<String, Integer> entry : locationMap.entrySet())
{
String key = entry.getKey();
Integer value = entry.getValue();
Article_Location articleLocation = locationArticleMap.get(key);
Article_NerLocation l1 = new Article_NerLocation();
l1.setNerSentiment(Integer.signum(value)); //-1, 0 or +1
l1.setKeyword(key);
l1.setCount(locationCountMap.get(key));
if(articleLocation!=null)
{
l1.setNerCountry(articleLocation.getCountryCode());
l1.setNerLatLong(articleLocation.getLatitude()+","+articleLocation.getLongitude());
l1.setTimeZone(articleLocation.getTimeZone());
l1.setCountryName(articleLocation.getCountryName());
}
locationList.add(l1);
}
}
private static void mapDate(HashMap<String, Integer> dateMap,
HashMap<String, Integer> dateCountMap,
HashMap<String, String> dateSentenceMap,
ArrayList<Artilcle_Ner_Date> datelist)
{
for(Map.Entry<String, Integer> entry : dateMap.entrySet())
{
String key = entry.getKey();
Integer value = entry.getValue();
Artilcle_Ner_Date d1 = new Artilcle_Ner_Date();
d1.setNerSentiment(Integer.signum(value)); //-1, 0 or +1
d1.setKeyword(key);
d1.setCount(dateCountMap.get(key));
d1.setSentence(dateSentenceMap.get(key));
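//SummaryThemeUtil.getSTByDate is an external call; if mapDate is slow, it is most likely this line rather than the map iteration itself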
d1.setNerDateTheme1(SummaryThemeUtil.getSTByDate(dateSentenceMap.get(key)));
datelist.add(d1);
}
}
private static void mapOrg(HashMap<String, Integer> orgMap,
HashMap<String, Integer> orgCountMap,
ArrayList<Articel_Ner> organisationlist)
{
for(Map.Entry<String, Integer> entry : orgMap.entrySet())
{
String key = entry.getKey();
Integer value = entry.getValue();
Articel_Ner o1 = new Articel_Ner();
o1.setNerSentiment(Integer.signum(value)); //-1, 0 or +1
o1.setKeyword(key);
o1.setCount(orgCountMap.get(key));
organisationlist.add(o1);
}
}
private static void mapPerson(HashMap<String, Integer> personMap,
HashMap<String, Integer> personCountMap,
ArrayList<Articel_Ner> personlist)
{
for(Map.Entry<String, Integer> entry : personMap.entrySet())
{
String key = entry.getKey();
Integer value = entry.getValue();
Articel_Ner p1 = new Articel_Ner();
p1.setNerSentiment(Integer.signum(value)); //-1, 0 or +1
p1.setKeyword(key);
p1.setCount(personCountMap.get(key));
personlist.add(p1);
}
}
private static String checkDate(String date)
{
if(date.length()<10)
return null;
else if(date.length()>10)
date = date.substring(0,10);
if (date.matches("\\d{4}-\\d{2}-\\d{2}"))
return date;
else
return null;
}
public static void main(String[] args)
{
String text = "Lets meet on every 2nd week. Night is young. Happy new Year. The festival will be held on the following dates are 18 Feb 1997, the 20th of july and 4 days from today.";
long pre = System.currentTimeMillis();
HashMap<String, Object> map = getStanford(text, 1508745558);
long post = System.currentTimeMillis();
long diff = post-pre;
System.out.println(diff);
System.out.println(map);
}
}
After days and days of sore black eyes, here is where the problem is:
Stanford "parse" model whether PCFG or SRparser both are CPU killers. You will never be able to scale. At best i was doing 70 docs/second. This is with 15 threads that i was able to manage on tomcat. The docs where being consumed from RabbitMQ. Machine Intel Xeon 8Core VM with 15 GB RAM. The CPU was always 90%.
So if you want to do NER, sentiment, and SUTime-style date extraction, it is better to use separate libraries rather than Stanford for all three: for NER you can use Stanford's NERClassifierCombiner, for sentiment you can use Weka, and for extracting dates you can use natty (see the sketch below).
Now we are able to process 2,000 docs/second.
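For illustration, here is a minimal natty sketch. This is a rough sketch, not our production code: the class name and sample text are mine, and it assumes the com.joestelmach.natty dependency is on the classpath.

import com.joestelmach.natty.DateGroup;
import com.joestelmach.natty.Parser;
import java.util.Date;
import java.util.List;

public class NattyDateExample
{
    public static void main(String[] args)
    {
        //construct once and reuse: building the parser is not free
        Parser parser = new Parser();
        List<DateGroup> groups = parser.parse("The festival will be held on 18 Feb 1997 and 4 days from today.");
        for (DateGroup group : groups)
        {
            //group.getText() is the matched phrase; each Date is a resolved value
            for (Date date : group.getDates())
            {
                System.out.println(group.getText() + " -> " + date);
            }
        }
    }
}

Recent natty versions also have a parse(String, Date) overload that takes a reference date, which covers what the DocDateAnnotation was doing in the pipeline version; check the version you are using.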