Search code examples
solr, solrj, solr4, apache-tika

Many PDFs indexed but only one returned in the Solr UI


I followed the example here for indexing all the PDFs in a directory. The process seems to work well, but at the end, when I go to the Solr UI and click on "Execute query" (with q=*:*), I get only one entry.

Am I missing something in my code?

...
// Entry names inside documentDir; the null check below guards against
// list() returning null.
String[] files = documentDir.list();

if (files != null)
{
  // Each entry is parsed with Tika and sent to Solr as its own document.
  for (String document : files)
  {       
    // A fresh handler, metadata holder, parse context and parser are created
    // for every file.
    ContentHandler textHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    AutoDetectParser autoDetectParser = new AutoDetectParser();

    // Declared outside the try so that the finally block can close it.
    InputStream inputStream = null;

    try
    {
      inputStream = new FileInputStream(new File(documentDir, document));

      // Tika reads the stream and delivers the extracted text to textHandler.
      autoDetectParser.parse(inputStream, textHandler, metadata, context);

      // The entry name is sent as the "id" field.
      // NOTE(review): whether "id" is the unique key depends on the Solr
      // schema, which is not visible from this code - confirm.
      SolrInputDocument doc = new SolrInputDocument();
      doc.addField("id", document);

      String content = textHandler.toString();

      // The extracted text goes into "fullText"; it is skipped when null.
      if (content != null)
      {
        doc.addField("fullText", content);
      }

      // NOTE(review): the second argument looks like SolrJ's commitWithin
      // (milliseconds) - confirm against the SolrJ version in use.
      UpdateResponse resp = server.add(doc, 1);

      // An explicit commit is issued after every single document.
      // NOTE(review): the three flags are presumably waitFlush, waitSearcher
      // and softCommit - confirm against the SolrJ version in use.
      server.commit(true, true, true);

      // The add response is only inspected after the commit was issued;
      // any non-zero status aborts the run.
      if (resp.getStatus() != 0)
      {
        throw new IDSystemException(LOG, "Document could not be indexed. Status returned: " + resp.getStatus());
      }
    }
    // Every failure type is rethrown as an IDSystemException that carries the
    // original message and the original exception as its cause.
    catch (FileNotFoundException fnfe)
    {
      throw new IDSystemException(LOG, fnfe.getMessage(), fnfe);
    }
    catch (IOException ioe)
    {
      throw new IDSystemException(LOG, ioe.getMessage(), ioe);
    }
    catch (SAXException se)
    {
      throw new IDSystemException(LOG, se.getMessage(), se);
    }
    catch (TikaException te)
    {
      throw new IDSystemException(LOG, te.getMessage(), te);
    }
    catch (SolrServerException sse)
    {
      throw new IDSystemException(LOG, sse.getMessage(), sse);
    }
    finally
    {
      // Close the stream if it was opened. An exception thrown from here
      // replaces any exception already propagating out of the try block.
      if (inputStream != null)
      {
        try
        {
          inputStream.close();
        }
        catch (IOException ioe)
        {
          throw new IDSystemException(LOG, ioe.getMessage(), ioe);
        }
      }
    }
    ...

Solution

  • I had the "signatureField" bound to "uid" in solrconfig.xml. Since "uid" is also the uniqueKey, every document received the same uid and overwrote the previous one. Now I have defined a separate field for the "signatureField" and it works!

    Before:

    ...
    <updateRequestProcessorChain name="deduplication">
        <processor
            class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
            <bool name="overwriteDupes">false</bool>
            <str name="signatureField">uid</str>  <---------------------
            <bool name="enabled">true</bool>
            <str name="fields">content</str>
            <str name="minTokenLen">10</str>
            <str name="quantRate">.2</str>
            <str name="signatureClass">solr.update.processor.TextProfileSignature</str>
        </processor>
        <processor class="solr.LogUpdateProcessorFactory" />
        <processor class="solr.RunUpdateProcessorFactory" />
    </updateRequestProcessorChain>
    ...
    
    
    ...
    <fields>
        <field name="uid" type="string" indexed="true" stored="true" multiValued="false" />
        <dynamicField name="ignored_*" type="ignored" multiValued="true" indexed="false" stored="false" />
        <field name="id" type="string" indexed="true" stored="true" multiValued="false" />
        <field name="fullText" indexed="true" type="text" multiValued="true" />
    </fields>
    <uniqueKey>uid</uniqueKey>
    ...
    

    After:

    ...
    
    <updateRequestProcessorChain name="deduplication">
        <processor
            class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
            <bool name="overwriteDupes">false</bool>
            <str name="signatureField">signatureField</str>  <---------------------
            <bool name="enabled">true</bool>
            <str name="fields">content</str>
            <str name="minTokenLen">10</str>
            <str name="quantRate">.2</str>
            <str name="signatureClass">solr.update.processor.TextProfileSignature</str>
        </processor>
        <processor class="solr.LogUpdateProcessorFactory" />
        <processor class="solr.RunUpdateProcessorFactory" />
    </updateRequestProcessorChain>
    ...
    
    
    ...
    <fields>
        <field name="uid" type="string" indexed="true" stored="true" multiValued="false" />
        <field name="signatureField" type="string" indexed="true" stored="true" multiValued="false" />  <----------------------------------
        <dynamicField name="ignored_*" type="ignored" multiValued="true" indexed="false" stored="false" />
        <field name="id" type="string" indexed="true" stored="true" multiValued="false" />
        <field name="fullText" indexed="true" type="text" multiValued="true" />
    </fields>
    <uniqueKey>uid</uniqueKey>
    ...