Search code examples
javasolrlucenenutch

Nutch crawling giving error "Error from server at http://localhost:8983/solr/nutch: java.lang.NullPointerException"


I'm trying to crawl a website and index it using Solr. I've some custom fields which are updated using UpdateRequestProcessor. My custom URP looks like this:

FieldProcessorFactory.java

package ved;

import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.update.processor.UpdateRequestProcessorFactory;

public class FieldProcessorFactory extends UpdateRequestProcessorFactory
{
  @Override
  public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next)
  {
    return new FieldProcessor(next);
  }
}

FieldProcessor.java

package ved;

import java.io.IOException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import java.util.StringTokenizer;

class FieldProcessor extends UpdateRequestProcessor
{
  public FieldProcessor( UpdateRequestProcessor next) {
    super( next );
  }

  public static int countWords(String arg) {
      if (arg == null) {
          return 0;
      }

      return arg.split("[\\pP\\s&&[^']]+").length;
  }

  @Override
  public void processAdd(AddUpdateCommand cmd) throws IOException {
      String title = (String) cmd.getSolrInputDocument().get("title").getValue(); 
      int title_length_i = countWords(title);
      cmd.getSolrInputDocument().addField("title_length_i", title_length_i);

      String anchor = (String) cmd.getSolrInputDocument().get("anchor").getValue(); 
      int anchor_length_i = countWords(anchor);
      cmd.getSolrInputDocument().addField("anchor_length_i", anchor_length_i);

      String body = (String) cmd.getSolrInputDocument().get("content").getValue(); 
      int body_length_i = countWords(body);
      cmd.getSolrInputDocument().addField("body_length_i", body_length_i);

      String url = (String) cmd.getSolrInputDocument().get("url").getValue(); 
      int url_length_i = countWords(url);
      cmd.getSolrInputDocument().addField("url_length_i", url_length_i);

      int whole_document_length_i = title_length_i  + body_length_i + url_length_i;
      cmd.getSolrInputDocument().addField("whole_document_length_i", whole_document_length_i);


      super.processAdd(cmd);
  }
}

schema.xml

<field name="title_length_i" type="int" stored="true" indexed="true"/>
    <field name="anchor_length_i" type="int" stored="true" indexed="true"/>
    <field name="body_length_i" type="int" stored="true" indexed="true"/>
    <field name="url_length_i" type="int" stored="true" indexed="true"/>
    <field name="whole_document_length_i" type="int" stored="true" indexed="true"/>

solrconfig.xml

  <lib dir="${solr.install.dir:../../../..}/plugins/" regex="fieldProcessor.jar" />

  <updateRequestProcessorChain name="process-articles" lib="custom-libs" version="1">
      <processor class="ved.FieldProcessorFactory"/>
      <processor class="solr.LogUpdateProcessorFactory" />
      <processor class="solr.DistributedUpdateProcessorFactory" />
      <processor class="solr.RunUpdateProcessorFactory" />
  </updateRequestProcessorChain>

  <initParams path="/update/**,/query,/select,/tvrh,/elevate,/spell,/browse">
    <lst name="defaults">
      <str name="df">text</str>
      <str name="update.chain">process-articles</str>
    </lst>
  </initParams>

But during crawling I'm getting following error:

java.lang.Exception: org.apache.solr.client.solrj.impl.HttpSolrClient$RemoteSolrException: Error from server at http://localhost:8983/solr/nutch: java.lang.NullPointerException
    at ved.FieldProcessor.processAdd(FieldProcessor.java:29)
    at org.apache.solr.handler.loader.JavabinLoader$1.update(JavabinLoader.java:98)
    at org.apache.solr.client.solrj.request.JavaBinUpdateRequestCodec$1.readOuterMostDocIterator(JavaBinUpdateRequestCodec.java:180)
    at org.apache.solr.client.solrj.request.JavaBinUpdateRequestCodec$1.readIterator(JavaBinUpdateRequestCodec.java:136)
    at org.apache.solr.common.util.JavaBinCodec.readObject(JavaBinCodec.java:306)
    at org.apache.solr.common.util.JavaBinCodec.readVal(JavaBinCodec.java:251)
    at org.apache.solr.client.solrj.request.JavaBinUpdateRequestCodec$1.readNamedList(JavaBinUpdateRequestCodec.java:122)
    at org.apache.solr.common.util.JavaBinCodec.readObject(JavaBinCodec.java:271)
    at org.apache.solr.common.util.JavaBinCodec.readVal(JavaBinCodec.java:251)
    at org.apache.solr.common.util.JavaBinCodec.unmarshal(JavaBinCodec.java:173)
    at org.apache.solr.client.solrj.request.JavaBinUpdateRequestCodec.unmarshal(JavaBinUpdateRequestCodec.java:187)
    at org.apache.solr.handler.loader.JavabinLoader.parseAndLoadDocs(JavabinLoader.java:108)
    at org.apache.solr.handler.loader.JavabinLoader.load(JavabinLoader.java:55)
    at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:97)
    at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:68)
    at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:173)
    at org.apache.solr.core.SolrCore.execute(SolrCore.java:2477)
    at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:723)
    at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:529)
    at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:361)
    at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:305)
    at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1691)
    at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:582)
    at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
    at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:548)
    at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:226)
    at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1180)
    at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:512)
    at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
    at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1112)
    at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
    at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:213)
    at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:119)
    at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:134)
    at org.eclipse.jetty.rewrite.handler.RewriteHandler.handle(RewriteHandler.java:335)
    at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:134)
    at org.eclipse.jetty.server.Server.handle(Server.java:534)
    at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:320)
    at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:251)
    at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:273)
    at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:95)
    at org.eclipse.jetty.io.SelectChannelEndPoint$2.run(SelectChannelEndPoint.java:93)
    at org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.executeProduceConsume(ExecuteProduceConsume.java:303)
    at org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.produceConsume(ExecuteProduceConsume.java:148)
    at org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.run(ExecuteProduceConsume.java:136)
    at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:671)
    at org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:589)
    at java.lang.Thread.run(Thread.java:748)

    at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:462)
    at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:529)
Caused by: org.apache.solr.client.solrj.impl.HttpSolrClient$RemoteSolrException: Error from server at http://localhost:8983/solr/nutch: java.lang.NullPointerException
    at ved.FieldProcessor.processAdd(FieldProcessor.java:29)
    at org.apache.solr.handler.loader.JavabinLoader$1.update(JavabinLoader.java:98)
    at org.apache.solr.client.solrj.request.JavaBinUpdateRequestCodec$1.readOuterMostDocIterator(JavaBinUpdateRequestCodec.java:180)
    at org.apache.solr.client.solrj.request.JavaBinUpdateRequestCodec$1.readIterator(JavaBinUpdateRequestCodec.java:136)
    at org.apache.solr.common.util.JavaBinCodec.readObject(JavaBinCodec.java:306)
    at org.apache.solr.common.util.JavaBinCodec.readVal(JavaBinCodec.java:251)
    at org.apache.solr.client.solrj.request.JavaBinUpdateRequestCodec$1.readNamedList(JavaBinUpdateRequestCodec.java:122)
    at org.apache.solr.common.util.JavaBinCodec.readObject(JavaBinCodec.java:271)
    at org.apache.solr.common.util.JavaBinCodec.readVal(JavaBinCodec.java:251)
    at org.apache.solr.common.util.JavaBinCodec.unmarshal(JavaBinCodec.java:173)
    at org.apache.solr.client.solrj.request.JavaBinUpdateRequestCodec.unmarshal(JavaBinUpdateRequestCodec.java:187)
    at org.apache.solr.handler.loader.JavabinLoader.parseAndLoadDocs(JavabinLoader.java:108)
    at org.apache.solr.handler.loader.JavabinLoader.load(JavabinLoader.java:55)
    at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:97)
    at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:68)
    at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:173)
    at org.apache.solr.core.SolrCore.execute(SolrCore.java:2477)
    at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:723)
    at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:529)
    at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:361)
    at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:305)
    at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1691)
    at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:582)
    at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
    at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:548)
    at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:226)
    at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1180)
    at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:512)
    at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
    at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1112)
    at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
    at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:213)
    at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:119)
    at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:134)
    at org.eclipse.jetty.rewrite.handler.RewriteHandler.handle(RewriteHandler.java:335)
    at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:134)
    at org.eclipse.jetty.server.Server.handle(Server.java:534)
    at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:320)
    at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:251)
    at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:273)
    at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:95)
    at org.eclipse.jetty.io.SelectChannelEndPoint$2.run(SelectChannelEndPoint.java:93)
    at org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.executeProduceConsume(ExecuteProduceConsume.java:303)
    at org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.produceConsume(ExecuteProduceConsume.java:148)
    at org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.run(ExecuteProduceConsume.java:136)
    at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:671)
    at org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:589)
    at java.lang.Thread.run(Thread.java:748)

    at org.apache.solr.client.solrj.impl.HttpSolrClient.executeMethod(HttpSolrClient.java:576)
    at org.apache.solr.client.solrj.impl.HttpSolrClient.request(HttpSolrClient.java:240)
    at org.apache.solr.client.solrj.impl.HttpSolrClient.request(HttpSolrClient.java:229)
    at org.apache.solr.client.solrj.SolrClient.request(SolrClient.java:1219)
    at org.apache.nutch.indexwriter.solr.SolrIndexWriter.push(SolrIndexWriter.java:210)
    at org.apache.nutch.indexwriter.solr.SolrIndexWriter.commit(SolrIndexWriter.java:188)
    at org.apache.nutch.indexwriter.solr.SolrIndexWriter.close(SolrIndexWriter.java:179)
    at org.apache.nutch.indexer.IndexWriters.close(IndexWriters.java:117)
    at org.apache.nutch.indexer.IndexerOutputFormat$1.close(IndexerOutputFormat.java:44)
    at org.apache.hadoop.mapred.ReduceTask$OldTrackingRecordWriter.close(ReduceTask.java:502)
    at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:456)
    at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:392)
    at org.apache.hadoop.mapred.LocalJobRunner$Job$ReduceTaskRunnable.run(LocalJobRunner.java:319)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
2020-02-22 14:31:49,730 ERROR indexer.IndexingJob - Indexer: java.io.IOException: Job failed!
    at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:873)
    at org.apache.nutch.indexer.IndexingJob.index(IndexingJob.java:147)
    at org.apache.nutch.indexer.IndexingJob.run(IndexingJob.java:230)
    at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
    at org.apache.nutch.indexer.IndexingJob.main(IndexingJob.java:239)

What might be going wrong ?


Solution

  • If the number of lines on the FieldProcessor.java file matches the one from the exception the lines seem to be coming from:

    String anchor = (String) cmd.getSolrInputDocument().get("anchor").getValue(); 
    

    For what I can see the get(key) method can return a null value if the requested field it is not present in the SolrInputDocument instance. You should check that:

    cmd.getSolrInputDocument().get("anchor")
    

    returns a non-null value.

    In my experience with Nutch, the anchor field is not always present in the crawled documents. Also, keep in mind that if a field is declared in the schema doesn't necessarily imply that all documents will have that field (unless specified as required in which case, if it is not present, the document will be rejected by Solr in an earlier stage).