Stormcrawler: Apache Tika для анализа свойств PDF - PullRequest
0 голосов
/ 11 мая 2018

Я добавил Tika в качестве зависимости в мою реализацию StormCrawler, и это позволяет краулеру загружать PDF-документы. Но заголовок, авторы и другие свойства не извлекаются. Я пробовал различные комбинации в «index.md.mapping:» и добавил соответствующие свойства в ES_IndexInit, но поле содержимого в Kibana (index) для PDF-документов всегда пусто. Для HTML-страниц все работает. Можете ли вы подсказать, что я упустил, или где можно посмотреть пример?


es-crawler.flux:

name: "crawler"

includes:
    - resource: true
      file: "/crawler-default.yaml"
      override: false

    - resource: false
      file: "crawler-conf.yaml"
      override: true

    - resource: false
      file: "es-conf.yaml"
      override: true

spouts:
  - id: "spout"
    className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout"
    parallelism: 10

bolts:
  - id: "partitioner"
    className: "com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt"
    parallelism: 1
  - id: "fetcher"
    className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt"
    parallelism: 1
  - id: "sitemap"
    className: "com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt"
    parallelism: 1
  - id: "parse"
    className: "com.digitalpebble.stormcrawler.bolt.JSoupParserBolt"
    parallelism: 5
  - id: "index"
    className: "com.digitalpebble.stormcrawler.elasticsearch.bolt.IndexerBolt"
    parallelism: 1
  - id: "status"
    className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
    parallelism: 1
  - id: "status_metrics"
    className: "com.digitalpebble.stormcrawler.elasticsearch.metrics.StatusMetricsBolt"
    parallelism: 4
  - id: "redirection_bolt"
    className: "com.digitalpebble.stormcrawler.tika.RedirectionBolt"
    parallelism: 1
  - id: "parser_bolt"
    className: "com.digitalpebble.stormcrawler.tika.ParserBolt"
    parallelism: 1

streams:
  - from: "spout"
    to: "partitioner"
    grouping:
      type: SHUFFLE

  - from: "spout"
    to: "status_metrics"
    grouping:
      type: SHUFFLE

  - from: "partitioner"
    to: "fetcher"
    grouping:
      type: FIELDS
      args: ["key"]

  - from: "fetcher"
    to: "sitemap"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "sitemap"
    to: "parse"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "parse"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "fetcher"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "sitemap"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "parse"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "index"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "parse"
    to: "redirection_bolt"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "redirection_bolt"
    to: "parser_bolt"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "redirection_bolt"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "parser_bolt"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE

es-injector.flux:

name: "injector"

includes:
    - resource: true
      file: "/crawler-default.yaml"
      override: false

    - resource: false
      file: "crawler-conf.yaml"
      override: true

    - resource: false
      file: "es-conf.yaml"
      override: true

    - resource: false
      file: "injection-conf.yaml"
      override: true

components:
  - id: "scheme"
    className: "com.digitalpebble.stormcrawler.util.StringTabScheme"
    constructorArgs:
      - DISCOVERED

spouts:
  - id: "spout"
    className: "com.digitalpebble.stormcrawler.spout.FileSpout"
    parallelism: 1
    constructorArgs:
      - "."
      - "seed.txt"
      - ref: "scheme"

bolts:
  - id: "status"
    className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
    parallelism: 1
  - id: "parser_bolt"
    className: "com.digitalpebble.stormcrawler.tika.ParserBolt"
    parallelism: 1

streams:
  - from: "spout"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]

pom.xml: http://maven.apache.org/maven-v4_0_0.xsd"></p> <pre><code><modelVersion>4.0.0</modelVersion> <groupId>xyz.com</groupId> <artifactId>search</artifactId> <version>search1.0</version> <packaging>jar</packaging> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> </properties> <build> <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> <version>3.2</version> <configuration> <source>1.8</source> <target>1.8</target> </configuration> </plugin> <plugin> <groupId>org.codehaus.mojo</groupId> <artifactId>exec-maven-plugin</artifactId> <version>1.3.2</version> <executions> <execution> <goals> <goal>exec</goal> </goals> </execution> </executions> <configuration> <executable>java</executable> <includeProjectDependencies>true</includeProjectDependencies> <includePluginDependencies>false</includePluginDependencies> <classpathScope>compile</classpathScope> </configuration> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> <version>1.3.3</version> <executions> <execution> <phase>package</phase> <goals> <goal>shade</goal> </goals> <configuration> <createDependencyReducedPom>false</createDependencyReducedPom> <transformers> <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" /> <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> <mainClass>org.apache.storm.flux.Flux</mainClass> <manifestEntries> <Change></Change> <Build-Date></Build-Date> </manifestEntries> </transformer> </transformers> <!-- The filters below are necessary if you want to include the Tika module --> <filters> <filter> <artifact>*:*</artifact> <excludes> <exclude>META-INF/*.SF</exclude> <exclude>META-INF/*.DSA</exclude> <exclude>META-INF/*.RSA</exclude> </excludes> </filter> </filters> </configuration> </execution> </executions> </plugin> </plugins> </build> <dependencies> 
<dependency> <groupId>org.apache.storm</groupId> <artifactId>storm-core</artifactId> <version>1.1.1</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.storm</groupId> <artifactId>flux-core</artifactId> <version>1.0.2</version> </dependency> <dependency> <groupId>com.digitalpebble.stormcrawler</groupId> <artifactId>storm-crawler-core</artifactId> <version>1.7</version> </dependency> <dependency> <groupId>com.digitalpebble.stormcrawler</groupId> <artifactId>storm-crawler-elasticsearch</artifactId> <version>1.7</version> </dependency> <dependency> <groupId>com.digitalpebble.stormcrawler</groupId> <artifactId>storm-crawler-tika</artifactId> <version>1.7</version> </dependency> </dependencies>

1 Ответ

0 голосов
/ 14 мая 2018

Ваши файлы pom и flux выглядят нормально. Для простоты вы можете сделать инжекцию (injector) частью основной топологии Flux.

Что находится в crawler-conf.yaml? Вы добавили префикс 'parse.' к именам полей?

Вот метаданные, извлеченные из URL, который вы разместили выше

parse.dcterms:modified: 2004-09-29T20:21:18Z
parse.pdf:PDFVersion: 1.4
parse.access_permission:can_print: true
parse.pdf:docinfo:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.pdf:docinfo:modified: 2004-09-29T20:21:18Z
parse.access_permission:extract_for_accessibility: true
parse.created: Fri Sep 24 15:56:30 BST 2004
parse.pdf:docinfo:created: 2004-09-24T14:56:30Z
parse.xmpTPg:NPages: 7
parse.access_permission:fill_in_form: true
parse.producer: Adobe PDF Library 6.0
parse.pdf:docinfo:title: About Metadata
parse.pdf:docinfo:producer: Adobe PDF Library 6.0
parse.dc:format: application/pdf; version=1.4
parse.access_permission:assemble_document: true
parse.access_permission:modify_annotations: true
parse.dc:title: About Metadata
parse.access_permission:can_print_degraded: true
parse.xmpMM:DocumentID: adobe:docid:indd:de7d50b0-0fc1-11d9-b0d4-cd42e793ca90
parse.xmpMM:DerivedFrom:DocumentID: adobe:docid:indd:a04d199f-0f11-11d9-b74d-bb0abf4f1ab0
parse.title: About Metadata
parse.Creation-Date: 2004-09-24T14:56:30Z
parse.modified: 2004-09-29T20:21:18Z
parse.resourceName: /digitalimag/pdfs/about_metadata.pdf
parse.dc:description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.Last-Save-Date: 2004-09-29T20:21:18Z
parse.creator: Adobe Systems Incorporated
parse.pdf:encrypted: false
parse.trapped: False
parse.pdf:docinfo:creator: Adobe Systems Incorporated
parse.date: 2004-09-29T20:21:18Z
parse.meta:save-date: 2004-09-29T20:21:18Z
parse.Author: Adobe Systems Incorporated
parse.X-Parsed-By: org.apache.tika.parser.DefaultParser
parse.X-Parsed-By: org.apache.tika.parser.pdf.PDFParser
parse.pdf:docinfo:creator_tool: Adobe InDesign CS (3.0.1)
parse.dcterms:created: 2004-09-24T14:56:30Z
parse.access_permission:can_modify: true
parse.subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.meta:author: Adobe Systems Incorporated
parse.access_permission:extract_content: true
parse.xmp:CreatorTool: Adobe InDesign CS (3.0.1)
parse.dc:creator: Adobe Systems Incorporated
parse.cp:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.pdf:docinfo:trapped: False
parse.meta:creation-date: 2004-09-24T14:56:30Z
parse.xmpMM:DerivedFrom:InstanceID: de7d50af-0fc1-11d9-b0d4-cd42e793ca90
parse.Last-Modified: 2004-09-29T20:21:18Z
parse.Content-Type: application/pdf
parse.description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 

Ваш файл конфигурации должен содержать что-то вроде:

  indexer.md.mapping:
  - parse.title=title
  - parse.Author=author

Как можно догадаться из кода тестового примера, вам нужно добавить файл в external/tika/src/test/resources/ и сослаться на имя этого файла в тестовом коде, как это сделано с about_metadata.pdf в приведенном ниже примере:

 @Test
public void testMetadata() throws IOException {
    // Set up the bolt with an empty configuration and a mocked topology
    // context; tuples it emits are collected through `output`.
    bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(),
            new OutputCollector(output));

    // Run the parser on the PDF fixture, passing the URL it stands for
    // and the fixture's file name.
    final String sourceUrl = "https://www.adobe.com/digitalimag/pdfs/about_metadata.pdf";
    final String fixtureName = "about_metadata.pdf";
    parse(sourceUrl, fixtureName);

    // Exactly one document is expected to be emitted.
    List<List<Object>> emitted = output.getEmitted();
    Assert.assertEquals(1, emitted.size());

    // The metadata object is read from the third field of the tuple.
    List<Object> firstTuple = emitted.get(0);
    Metadata metadata = (Metadata) firstTuple.get(2);

    // The PDF subject must be exposed under the "parse."-prefixed key.
    String subject = metadata.getFirstValue("parse.pdf:docinfo:subject");
    final String expectedFragment = "By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient.";
    Assert.assertTrue(subject.contains(expectedFragment));
}

UPDATE

При ближайшем рассмотрении оказалось, что проблема связана с вашим файлом flux. RedirectionBolt отправляет кортежи в Tika по отдельному потоку (stream) с именем "tika". Поэтому определение должно быть таким:

# Stream definition connecting the redirection bolt to the Tika parser bolt.
from: "redirection_bolt"
to: "parser_bolt"
grouping:
  type: LOCAL_OR_SHUFFLE
  # The redirection bolt sends tuples for Tika on a dedicated stream named
  # "tika", so the stream id has to be given explicitly here.
  streamId: "tika"
...