Search code examples
indexingsolrluceneapache-tika

Parse image as well as text from pdf using tika in solr(Request Handler)


I am trying to index pdf file using solr 6 and want to extract and save image(if have) to some location. I am using below configuration but not able to extract the image. I have successfully index the pdf text contents.

schema.xml

<?xml version="1.0" encoding="UTF-8" ?>
 <schema name="Breast-Cancer_PDFSchema" version="1.6">
 <uniqueKey>id</uniqueKey>
<field name="id" type="strings" multiValued="false" indexed="true" required="true" stored="true"/>
<field name="_version_" type="long" indexed="true" stored="true"/>
<field name="date" type="tdates" indexed="true" stored="true"/>
<field name="pdf_pdfversion" type="strings" indexed="true" stored="true"/>
<field name="stream_content_type" type="strings" indexed="true" stored="true"/>
<field name="access_permission_modify_annotations" type="strings" indexed="true" stored="true"/>    
<field name="access_permission_can_print_degraded" type="strings" indexed="true" stored="true"/>
<field name="dcterms_created" type="strings" indexed="true" stored="true"/>
<field name="last_modified" type="strings" indexed="true" stored="true"/>
<field name="dcterms_modified" type="strings" indexed="true" stored="true"/>
<field name="dc_format" type="strings" indexed="true" stored="true"/>
<field name="last_save_date" type="strings" indexed="true" stored="true"/>
<field name="access_permission_fill_in_form" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_modified" type="strings" indexed="true" stored="true"/>
<field name="stream_name" type="strings" indexed="true" stored="true"/>
<field name="meta_save_date" type="strings" indexed="true" stored="true"/>
<field name="pdf_encrypted" type="strings" indexed="true" stored="true"/>
<field name="modified" type="strings" indexed="true" stored="true"/>
<field name="content_type" type="strings" indexed="true" stored="true"/>
<field name="stream_size" type="strings" indexed="true" stored="true"/>
<field name="x_parsed_by" type="strings" indexed="true" stored="true"/>
<field name="meta_creation_date" type="strings" indexed="true" stored="true"/>
<field name="stream_source_info" type="strings" indexed="true" stored="true"/>
<field name="created" type="strings" indexed="true" stored="true"/>
<field name="access_permission_extract_for_accessibility" type="strings" indexed="true" stored="true"/>
<field name="access_permission_assemble_document" type="strings" indexed="true" stored="true"/>
<field name="xmptpg_npages" type="strings" indexed="true" stored="true"/>
<field name="creation_date" type="strings" indexed="true" stored="true"/>
<field name="access_permission_extract_content" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_print" type="strings" indexed="true" stored="true"/>
<field name="producer" type="strings" indexed="true" stored="true"/>
<field name="subject" type="strings" indexed="true" stored="true"/>
<field name="dc_creator" type="strings" indexed="true" stored="true"/>
<field name="aapl_keywords" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_producer" type="strings" indexed="true" stored="true"/>
<field name="resourcename" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_modify" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_created" type="strings" indexed="true" stored="true"/>
<field name="_text_" type="strings" indexed="true" stored="true"/>



<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
<fieldType name="strings" class="solr.TextField" sortMissingLast="true" multiValued="true" />
<fieldType name="long" class="solr.TrieLongField" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
<fieldType name="tdates" class="solr.TrieDateField" positionIncrementGap="0" multiValued="true" precisionStep="6"/>
<fieldType name="tlongs" class="solr.TrieLongField" positionIncrementGap="0" multiValued="true" precisionStep="8"/>
<fieldType name="tdoubles" class="solr.TrieDoubleField" positionIncrementGap="0" multiValued="true" precisionStep="8"/>

solr-config.xml

  <requestHandler name="/update/extract" startup="lazy" class="org.apache.solr.handler.extraction.ExtractingRequestHandler" >
<entries>
    <entry class="org.apache.tika.parser.pdf.AutoDetectParser"> </entry>
</entries>
<lst name="defaults">
  <str name="lowernames">true</str>
  <str name="fmap.meta">ignored_</str>
  <str name="fmap.content">_text_</str>
  <str name="fmap.id">id</str>
</lst>  
</requestHandler> 

I have followed the apache solr offical documentation but made changes to solr-config.xml according to them still have the same issue.


Solution

  • After reading your question and if I am not wrong, you are posting PDF file (which contains some text and images) to solr. The solr will only index that document it will not extract the images and store them to other location.

    Solr internally uses Tika libraries for parsing the documents, but it cannot be used in your requirement

    To achieve your requirement,

    1. parse your pdf and extract all the images and other contents and store them
    2. index all the extracted contents and pdf in solr.