Я пытаюсь проиндексировать PDF-файл с помощью Solr 6 и хочу извлечь и сохранить изображение (если есть) в каком-то месте.Я использую приведенную ниже конфигурацию, но не могу извлечь изображение.Я успешно проиндексировал содержание текста в формате PDF.
schema.xml
<?xml version="1.0" encoding="UTF-8" ?>
<schema name="Breast-Cancer_PDFSchema" version="1.6">
<uniqueKey>id</uniqueKey>
<field name="id" type="strings" multiValued="false" indexed="true" required="true" stored="true"/>
<field name="_version_" type="long" indexed="true" stored="true"/>
<field name="date" type="tdates" indexed="true" stored="true"/>
<field name="pdf_pdfversion" type="strings" indexed="true" stored="true"/>
<field name="stream_content_type" type="strings" indexed="true" stored="true"/>
<field name="access_permission_modify_annotations" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_print_degraded" type="strings" indexed="true" stored="true"/>
<field name="dcterms_created" type="strings" indexed="true" stored="true"/>
<field name="last_modified" type="strings" indexed="true" stored="true"/>
<field name="dcterms_modified" type="strings" indexed="true" stored="true"/>
<field name="dc_format" type="strings" indexed="true" stored="true"/>
<field name="last_save_date" type="strings" indexed="true" stored="true"/>
<field name="access_permission_fill_in_form" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_modified" type="strings" indexed="true" stored="true"/>
<field name="stream_name" type="strings" indexed="true" stored="true"/>
<field name="meta_save_date" type="strings" indexed="true" stored="true"/>
<field name="pdf_encrypted" type="strings" indexed="true" stored="true"/>
<field name="modified" type="strings" indexed="true" stored="true"/>
<field name="content_type" type="strings" indexed="true" stored="true"/>
<field name="stream_size" type="strings" indexed="true" stored="true"/>
<field name="x_parsed_by" type="strings" indexed="true" stored="true"/>
<field name="meta_creation_date" type="strings" indexed="true" stored="true"/>
<field name="stream_source_info" type="strings" indexed="true" stored="true"/>
<field name="created" type="strings" indexed="true" stored="true"/>
<field name="access_permission_extract_for_accessibility" type="strings" indexed="true" stored="true"/>
<field name="access_permission_assemble_document" type="strings" indexed="true" stored="true"/>
<field name="xmptpg_npages" type="strings" indexed="true" stored="true"/>
<field name="creation_date" type="strings" indexed="true" stored="true"/>
<field name="access_permission_extract_content" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_print" type="strings" indexed="true" stored="true"/>
<field name="producer" type="strings" indexed="true" stored="true"/>
<field name="subject" type="strings" indexed="true" stored="true"/>
<field name="dc_creator" type="strings" indexed="true" stored="true"/>
<field name="aapl_keywords" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_producer" type="strings" indexed="true" stored="true"/>
<field name="resourcename" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_modify" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_created" type="strings" indexed="true" stored="true"/>
<field name="_text_" type="strings" indexed="true" stored="true"/>
<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
<fieldType name="strings" class="solr.TextField" sortMissingLast="true" multiValued="true" />
<fieldType name="long" class="solr.TrieLongField" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
<fieldType name="tdates" class="solr.TrieDateField" positionIncrementGap="0" multiValued="true" precisionStep="6"/>
<fieldType name="tlongs" class="solr.TrieLongField" positionIncrementGap="0" multiValued="true" precisionStep="8"/>
<fieldType name="tdoubles" class="solr.TrieDoubleField" positionIncrementGap="0" multiValued="true" precisionStep="8"/>
solr-config.xml
<requestHandler name="/update/extract" startup="lazy" class="org.apache.solr.handler.extraction.ExtractingRequestHandler" >
<entries>
<entry class="org.apache.tika.parser.pdf.AutoDetectParser"> </entry>
</entries>
<lst name="defaults">
<str name="lowernames">true</str>
<str name="fmap.meta">ignored_</str>
<str name="fmap.content">_text_</str>
<str name="fmap.id">id</str>
</lst>
</requestHandler>
Я следовалофициальная документация apache solr, но внесенные в solr-config.xml изменения в соответствии с ними, по-прежнему имеют ту же проблему.