Search code examples
web-crawlerapache-tikastormcrawler

Stormcrawler: Apache Tika for parsing PDF properties


I have added Tika as a reference to my StormCrawler implementation and that enables to fetch the PDF documents in the crawl. But, the Title, Authors and other properties don't get parsed. I have tried with different combinations to 'index.md.mapping:' and added the corresponding properties to ES_IndexInit, but the content field in Kibana (index) for PDF's documents is always empty. Everything works for HTML pages. Can you please help with some pointers, if I am missing something or I can look at an example?


es-crawler.flux:

name: "crawler"

includes: - resource: true file: "/crawler-default.yaml" override: false

- resource: false
  file: "crawler-conf.yaml"
  override: true

- resource: false
  file: "es-conf.yaml"
  override: true

spouts: - id: "spout" className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout" parallelism: 10

bolts: - id: "partitioner" className: "com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt" parallelism: 1 - id: "fetcher" className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt" parallelism: 1 - id: "sitemap" className: "com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt" parallelism: 1 - id: "parse" className: "com.digitalpebble.stormcrawler.bolt.JSoupParserBolt" parallelism: 5 - id: "index" className: "com.digitalpebble.stormcrawler.elasticsearch.bolt.IndexerBolt" parallelism: 1 - id: "status" className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt" parallelism: 1 - id: "status_metrics" className: "com.digitalpebble.stormcrawler.elasticsearch.metrics.StatusMetricsBolt" parallelism: 4 - id: "redirection_bolt" className: "com.digitalpebble.stormcrawler.tika.RedirectionBolt" parallelism: 1 - id: "parser_bolt" className: "com.digitalpebble.stormcrawler.tika.ParserBolt" parallelism: 1

streams: - from: "spout" to: "partitioner" grouping: type: SHUFFLE

  • from: "spout" to: "status_metrics" grouping: type: SHUFFLE

  • from: "partitioner" to: "fetcher" grouping: type: FIELDS args: ["key"]

  • from: "fetcher" to: "sitemap" grouping: type: LOCAL_OR_SHUFFLE

  • from: "sitemap" to: "parse" grouping: type: LOCAL_OR_SHUFFLE

  • from: "parse" to: "index" grouping: type: LOCAL_OR_SHUFFLE

  • from: "fetcher" to: "status" grouping: type: FIELDS args: ["url"] streamId: "status"

  • from: "sitemap" to: "status" grouping: type: FIELDS args: ["url"] streamId: "status"

  • from: "parse" to: "status" grouping: type: FIELDS args: ["url"] streamId: "status"

  • from: "index" to: "status" grouping: type: FIELDS args: ["url"] streamId: "status"

  • from: "parse" to: "redirection_bolt" grouping: type: LOCAL_OR_SHUFFLE

  • from: "redirection_bolt" to: "parser_bolt" grouping: type: LOCAL_OR_SHUFFLE

  • from: "redirection_bolt" to: "index" grouping: type: LOCAL_OR_SHUFFLE

  • from: "parser_bolt" to: "index" grouping: type: LOCAL_OR_SHUFFLE

es-injector.flux: name: "injector"

includes: - resource: true file: "/crawler-default.yaml" override: false

- resource: false
  file: "crawler-conf.yaml"
  override: true

- resource: false
  file: "es-conf.yaml"
  override: true

- resource: false
  file: "injection-conf.yaml"
  override: true

components: - id: "scheme" className: "com.digitalpebble.stormcrawler.util.StringTabScheme" constructorArgs: - DISCOVERED

spouts: - id: "spout" className: "com.digitalpebble.stormcrawler.spout.FileSpout" parallelism: 1 constructorArgs: - "." - "seeds.txt" - ref: "scheme"

bolts: - id: "status" className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt" parallelism: 1 - id: "parser_bolt" className: "com.digitalpebble.stormcrawler.tika.ParserBolt" parallelism: 1

streams: - from: "spout" to: "status" grouping: type: FIELDS args: ["url"]

pom.xml: http://maven.apache.org/maven-v4_0_0.xsd">

<modelVersion>4.0.0</modelVersion>
<groupId>xyz.com</groupId>
<artifactId>search</artifactId>
<version>search1.0</version>
<packaging>jar</packaging>

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.2</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.codehaus.mojo</groupId>
            <artifactId>exec-maven-plugin</artifactId>
            <version>1.3.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>exec</goal>
                    </goals>
                </execution>
            </executions>
            <configuration>
                <executable>java</executable>
                <includeProjectDependencies>true</includeProjectDependencies>
                <includePluginDependencies>false</includePluginDependencies>
                <classpathScope>compile</classpathScope>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>1.3.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <createDependencyReducedPom>false</createDependencyReducedPom>
                        <transformers>
                            <transformer
                                implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
                            <transformer
                                implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                              <mainClass>org.apache.storm.flux.Flux</mainClass>
                              <manifestEntries>
                                <Change></Change>
                                <Build-Date></Build-Date>
                              </manifestEntries>
                            </transformer>
                        </transformers>
                        <!-- The filters below are necessary if you want to include the Tika
                            module -->
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

<dependencies>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-core</artifactId>
        <version>1.1.1</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>flux-core</artifactId>
        <version>1.0.2</version>
    </dependency>
    <dependency>
        <groupId>com.digitalpebble.stormcrawler</groupId>
        <artifactId>storm-crawler-core</artifactId>
        <version>1.7</version>
    </dependency>
    <dependency>
        <groupId>com.digitalpebble.stormcrawler</groupId>
        <artifactId>storm-crawler-elasticsearch</artifactId>
        <version>1.7</version>
    </dependency>
    <dependency>
        <groupId>com.digitalpebble.stormcrawler</groupId>
        <artifactId>storm-crawler-tika</artifactId>
        <version>1.7</version>
    </dependency>
</dependencies>


Solution

  • Your pom and flux files look ok. You could put the injection as part of the main flux to keep things simple.

    What's in crawler-conf.yaml? Did you prefix the field names with 'parse.'?

    Here is the metadata extracted from the URL you posted above

    parse.dcterms:modified: 2004-09-29T20:21:18Z
    parse.pdf:PDFVersion: 1.4
    parse.access_permission:can_print: true
    parse.pdf:docinfo:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
    parse.pdf:docinfo:modified: 2004-09-29T20:21:18Z
    parse.access_permission:extract_for_accessibility: true
    parse.created: Fri Sep 24 15:56:30 BST 2004
    parse.pdf:docinfo:created: 2004-09-24T14:56:30Z
    parse.xmpTPg:NPages: 7
    parse.access_permission:fill_in_form: true
    parse.producer: Adobe PDF Library 6.0
    parse.pdf:docinfo:title: About Metadata
    parse.pdf:docinfo:producer: Adobe PDF Library 6.0
    parse.dc:format: application/pdf; version=1.4
    parse.access_permission:assemble_document: true
    parse.access_permission:modify_annotations: true
    parse.dc:title: About Metadata
    parse.access_permission:can_print_degraded: true
    parse.xmpMM:DocumentID: adobe:docid:indd:de7d50b0-0fc1-11d9-b0d4-cd42e793ca90
    parse.xmpMM:DerivedFrom:DocumentID: adobe:docid:indd:a04d199f-0f11-11d9-b74d-bb0abf4f1ab0
    parse.title: About Metadata
    parse.Creation-Date: 2004-09-24T14:56:30Z
    parse.modified: 2004-09-29T20:21:18Z
    parse.resourceName: /digitalimag/pdfs/about_metadata.pdf
    parse.dc:description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
    parse.Last-Save-Date: 2004-09-29T20:21:18Z
    parse.creator: Adobe Systems Incorporated
    parse.pdf:encrypted: false
    parse.trapped: False
    parse.pdf:docinfo:creator: Adobe Systems Incorporated
    parse.date: 2004-09-29T20:21:18Z
    parse.meta:save-date: 2004-09-29T20:21:18Z
    parse.Author: Adobe Systems Incorporated
    parse.X-Parsed-By: org.apache.tika.parser.DefaultParser
    parse.X-Parsed-By: org.apache.tika.parser.pdf.PDFParser
    parse.pdf:docinfo:creator_tool: Adobe InDesign CS (3.0.1)
    parse.dcterms:created: 2004-09-24T14:56:30Z
    parse.access_permission:can_modify: true
    parse.subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
    parse.meta:author: Adobe Systems Incorporated
    parse.access_permission:extract_content: true
    parse.xmp:CreatorTool: Adobe InDesign CS (3.0.1)
    parse.dc:creator: Adobe Systems Incorporated
    parse.cp:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
    parse.pdf:docinfo:trapped: False
    parse.meta:creation-date: 2004-09-24T14:56:30Z
    parse.xmpMM:DerivedFrom:InstanceID: de7d50af-0fc1-11d9-b0d4-cd42e793ca90
    parse.Last-Modified: 2004-09-29T20:21:18Z
    parse.Content-Type: application/pdf
    parse.description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
    

    Your conf should contain something like

      indexer.md.mapping:
      - parse.title=title
      - parse.Author=author
    

    As you can guess from the code of the test case, you need to add the file in external/tika/src/test/resources/ and refer to the name of the file in the test code, as with about_metadata.pdf in the example below

     @Test
    public void testMetadata() throws IOException {
    
        bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(),
                new OutputCollector(output));
    
        parse("https://www.adobe.com/digitalimag/pdfs/about_metadata.pdf",
                "about_metadata.pdf");
    
        List<List<Object>> outTuples = output.getEmitted();
    
        // single document
        Assert.assertEquals(1, outTuples.size());
        // metadata
        Metadata md = (Metadata) outTuples.get(0).get(2);
        Assert.assertTrue(
                md.getFirstValue("parse.pdf:docinfo:subject").contains(
                        "By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient."));
    
    }
    

    UPDATE

    on closer inspection, the problem comes from your flux. The redirection bolt sends the tuple to Tika on a bespoke stream named 'tika'. The definition should therefore be

    from: "redirection_bolt"
    to: "parser_bolt"
    grouping:
      type: LOCAL_OR_SHUFFLE
      streamId: "tika"