Question

Я пытаюсь извлечь изображение, связанное с текстом, в файле PDF. Например, в PDF будет фотография фасада дома. Прямо над фотографией будет надпись «Вид спереди». Я хочу, чтобы программа искала в PDF-файле текст «Вид спереди» и извлекла фотографию, которая следует за ним.

Я просматривал iTextSharp, PDFsharp и другие утилиты, но все они обрабатывают текст и изображения в PDF отдельно. Кажется, нет никакого способа выяснить, что эта строка текста предшествует этому изображению.

Мы используем iTextSharp для манипулирования PDF-файлами. Я написал метод на C#, который будет извлекать изображение с учетом номера страницы, номера изображения на странице и типа изображения. Например, я могу извлечь 2-й JPEG на странице 3. Вот код для этого. Я хотел бы иметь возможность искать строку текста в файле и затем извлекать изображение, следующее за этой строкой текста.

/// <summary>
/// Render listener that writes the images found on a PDF page to disk. Extraction can be
/// restricted to a set of image file types and/or to the N-th matching image on the page.
/// </summary>
public class ImageExtractor : IRenderListener
{
    // Page currently being processed; used here only to build output file names.
    int _currentPage = 1;
    // Number of images that passed all filters (incremented even if the file was not rewritten).
    int _imageCount = 0;
    // 1-based position of the wanted image among the type-matching images; 0 means "all images".
    readonly int _index = 0;
    // Number of type-matching images seen so far while searching for image number _index.
    int _count = 0;
    readonly string _outputFilePrefix;
    readonly string _outputFolder;
    readonly bool _overwriteExistingFiles;
    // Accepted image file types (compared case-insensitively); null accepts every type.
    readonly string[] _fileTypes;

    /// <summary>Creates a listener with the given output settings and filters.</summary>
    /// <param name="outputFilePrefix">Prefix used for every generated image file name.</param>
    /// <param name="outputFolder">Folder the image files are written to.</param>
    /// <param name="overwriteExistingFiles">Whether an already existing file is overwritten.</param>
    /// <param name="fileTypes">Accepted image file types, or null to accept all.</param>
    /// <param name="index">1-based index of the single image to extract, or 0 for all images.</param>
    public ImageExtractor(string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, string[] fileTypes, int index)
    {
        _outputFilePrefix = outputFilePrefix;
        _outputFolder = outputFolder;
        _overwriteExistingFiles = overwriteExistingFiles;
        _fileTypes = fileTypes;
        _index = index;
    }

    /// <summary>
    /// Extracts images from one page of a PDF file.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <param name="outputFilePrefix">File name prefix; null falls back to the PDF file name without extension.</param>
    /// <param name="outputFolder">Target folder; null or empty falls back to the folder of the PDF file.</param>
    /// <param name="overwriteExistingFiles">Whether an already existing image file is overwritten.</param>
    /// <param name="pageNumber">Number of the page to process.</param>
    /// <param name="index">1-based index of the image to extract among the matching ones, or 0 for all.</param>
    /// <param name="fileTypes">Accepted image file types, or null to accept all.</param>
    /// <returns>The number of images that passed the filters; 0 if the PDF has no pages.</returns>
    /// <exception cref="ApplicationException">The PDF file is encrypted.</exception>
    public static int ExtractImageByIndex(string pdfPath, string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, int pageNumber, int index, string[] fileTypes = null)
    {
        // Handle setting of any default values
        outputFilePrefix = outputFilePrefix ?? System.IO.Path.GetFileNameWithoutExtension(pdfPath);
        outputFolder = String.IsNullOrEmpty(outputFolder) ? System.IO.Path.GetDirectoryName(pdfPath) : outputFolder;

        var instance = new ImageExtractor(outputFilePrefix, outputFolder, overwriteExistingFiles, fileTypes, index);
        instance._currentPage = pageNumber;

        using (var pdfReader = new PdfReader(pdfPath))
        {
            if (pdfReader.NumberOfPages == 0)
                return 0;

            if (pdfReader.IsEncrypted())
                throw new ApplicationException(pdfPath + " is encrypted.");

            var pdfParser = new PdfReaderContentParser(pdfReader);

            pdfParser.ProcessContent(instance._currentPage, instance);
        }

        return instance._imageCount;
    }

    // Text events are irrelevant for image extraction, so these callbacks are no-ops.
    public void BeginTextBlock() { }
    public void EndTextBlock() { }
    public void RenderText(TextRenderInfo renderInfo) { }

    /// <summary>
    /// Called for every image on the processed page; saves the image if it passes the
    /// file type filter and, when a specific index was requested, if it is that image.
    /// </summary>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        // If _index is greater than 0, we're looking for a specific image. If _count is
        // equal to _index, we've already found it, so don't go any farther.
        if (_index > 0 && _count == _index)
            return;

        var imageObject = renderInfo.GetImage();

        // Defensive: presumably GetImage() can yield null for images it cannot expose;
        // skip those instead of failing with a NullReferenceException.
        if (imageObject == null)
            return;

        var fileType = imageObject.GetFileType();
        if (!IsAcceptedFileType(fileType))
            return;

        // If _index is 0, multiple images may be extracted. If _index is greater than 0,
        // count every image that matches the file type and only extract the image whose
        // count equals the requested index.
        if (_index > 0)
        {
            _count++;
            if (_count != _index)
                return;
        }

        var imageFileName = String.Format("{0}_{1}_{2}.{3}", _outputFilePrefix, _currentPage, _imageCount, fileType);
        var imagePath = System.IO.Path.Combine(_outputFolder, imageFileName);

        if (_overwriteExistingFiles || !File.Exists(imagePath))
            File.WriteAllBytes(imagePath, imageObject.GetImageAsBytes());

        // Subtle: Always increment even if file is not written. This ensures consistency should only some
        //   of a PDF file's images actually exist.
        _imageCount++;
    }

    // Returns true when no type filter is set or the given type is one of the accepted types.
    // Uses an ordinal, case-insensitive comparison so the result does not depend on the
    // current culture and null entries in the filter cannot cause an exception.
    bool IsAcceptedFileType(string fileType)
    {
        if (_fileTypes == null)
            return true;

        return Array.Exists(_fileTypes, t => String.Equals(t, fileType, StringComparison.OrdinalIgnoreCase));
    }
}

mkl · Answer 1 · 18 июня 2019

Как уже упоминалось в комментарии, это очень похоже на тему вопроса «Извлечение изображений, присутствующих в абзаце», с основным отличием в том, что в контексте того вопроса использовался iText для Java, а не iTextSharp для .Net.

Порт Java SimpleMixedExtractionStrategy из этого вопроса может выглядеть так:

/// <summary>
/// Text extraction strategy that additionally saves every image it is notified about and
/// adds a "[file name]" text chunk located at the image's transformed base line, so the
/// extracted text contains a marker for each image.
/// </summary>
public class SimpleMixedExtractionStrategy : LocationTextExtractionStrategy
{
    // Reflection handle for the private chunk list of the base class; looked up once for all instances.
    static readonly FieldInfo LocationalResultField = typeof(LocationTextExtractionStrategy).GetField("locationalResult", BindingFlags.Instance | BindingFlags.NonPublic);
    // Unit segment along the x axis; transformed by the image matrix to locate the image marker.
    static readonly LineSegment UnitLine = new LineSegment(new Vector(0, 0, 1), new Vector(1, 0, 1));
    readonly String outputPath;
    readonly String name;
    // Running number of the images saved by this instance; part of each file name.
    int counter = 0;

    /// <summary>Creates a strategy that stores images in the given folder.</summary>
    /// <param name="outputPath">Folder the image files are written to.</param>
    /// <param name="name">Base name of the image files; "-number.type" is appended.</param>
    public SimpleMixedExtractionStrategy(String outputPath, String name)
    {
        this.outputPath = outputPath;
        this.name = name;
    }

    /// <summary>
    /// Saves the image to disk and registers a "[file name]" chunk at the image location.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The base class has no private field named "locationalResult".
    /// </exception>
    public override void RenderImage(ImageRenderInfo renderInfo)
    {
        PdfImageObject image = renderInfo.GetImage();
        if (image == null) return;

        // Fail with a clear message, and before writing any file, if the reflected field is missing.
        if (LocationalResultField == null)
            throw new InvalidOperationException("LocationTextExtractionStrategy has no private field 'locationalResult'.");

        int number = counter++;
        String filename = name + "-" + number + "." + image.GetFileType();
        // Path.Combine inserts a directory separator when outputPath lacks a trailing one.
        // Fully qualified because an unqualified Path may be ambiguous with an iTextSharp type (assumption).
        File.WriteAllBytes(System.IO.Path.Combine(outputPath ?? "", filename), image.GetImageAsBytes());

        LineSegment segment = UnitLine.TransformBy(renderInfo.GetImageCTM());
        TextChunk location = new TextChunk("[" + filename + "]", segment.GetStartPoint(), segment.GetEndPoint(), 0f);

        List<TextChunk> locationalResult = (List<TextChunk>)LocationalResultField.GetValue(this);
        locationalResult.Add(location);
    }
}

Так же, как и в реализации на Java, здесь необходимо использовать отражение (reflection) для доступа к private List<TextChunk> locationalResult в LocationTextExtractionStrategy. Если использование отражения не разрешено в вашем проекте, вы можете скопировать весь исходный код LocationTextExtractionStrategy в собственный класс и применить изменения к копии.

Вы можете использовать его следующим образом:

// Input PDF, target folder for the extracted images, and base name of the image files.
String sourceFile = @"SOURCE.pdf";
String imagePath = @"extract\";
String imageBaseName = "SOURCE-";
Directory.CreateDirectory(imagePath);

using (PdfReader pdfReader = new PdfReader(sourceFile))
{
    var contentParser = new PdfReaderContentParser(pdfReader);
    int page = 1;
    while (page <= pdfReader.NumberOfPages)
    {
        // A new strategy per page: its image counter starts at 0 and the page number
        // becomes part of the image base name.
        var strategy = new SimpleMixedExtractionStrategy(imagePath, imageBaseName + page);
        contentParser.ProcessContent(page, strategy);

        // Print the page text, which includes the "[file name]" image markers.
        Console.Write("Text of page {0}:\n---\n{1}\n\n", page, strategy.GetResultantText());
        page++;
    }
}

Для файла примера из упомянутого вопроса

вывод:

Text of page 1:
---
Getting Started with Vaadin
• A version of Book of Vaadin that you can browse in the Eclipse Help system.
You can install the plugin as follows:
1. Start Eclipse.
2. Select Help   Software Updates....
3. Select the Available Software tab.
4. Add the Vaadin plugin update site by clicking Add Site....
[book-of-vaadin-page14-1-0.png]
Enter the URL of the Vaadin Update Site: http://vaadin.com/eclipse and click OK. The
Vaadin site should now appear in the Software Updates window.
5. Select all the Vaadin plugins in the tree.
[book-of-vaadin-page14-1-1.png]
Finally, click Install.
Detailed and up-to-date installation instructions for the Eclipse plugin can be found at http://vaad-
in.com/eclipse.
Updating the Vaadin Plugin
If you have automatic updates enabled in Eclipse (see Window   Preferences   Install/Update
  Automatic Updates), the Vaadin plugin will be updated automatically along with other plugins.
Otherwise, you can update the Vaadin plugin (there are actually multiple plugins) manually as
follows:
1. Select Help   Software Updates..., the Software Updates and Add-ons window will
open.
2. Select the Installed Software tab.
14 Vaadin Plugin for Eclipse

Таким образом, для вашей задачи

Мне хотелось бы найти строку текста в файле и затем извлечь изображение, следующее за этой строкой текста.

просто найдите эту строку текста в выходной строке выше и найдите следующую строку, содержащую имя файла изображения в квадратных скобках.

(Если ваш PDF-файл также использует квадратные скобки, вы можете заключить имя файла в другие разделители в SimpleMixedExtractionStrategy, например, в некоторые символы из области частного использования Unicode.)

Tim · Answer 2 · 19 июня 2019

Вот решение, которое я нашел. Оригинальный код содержит много материала, который не имеет прямого отношения к вопросу, поэтому я упростил его для поста.

/// <summary>
/// Simplified render listener that extracts the first image encountered after a text
/// chunk containing a given caption.
/// </summary>
public class ImageExtractor : IRenderListener
{
    // Caption text to look for.
    private string _caption;
    // True from the moment the caption was seen until the next image has been handled.
    private bool _captionFound;
    private string _outputFolder;

    // ....
    // .... (remaining fields, the constructor and other members are omitted in this simplified listing)

    public void BeginTextBlock() { }

    public void EndTextBlock() { }

    public void RenderText(TextRenderInfo renderInfo)
    {
        // If this chunk of text contains the caption, set _captionFound to true.
        // NOTE(review): a caption split across several text chunks would not be detected
        // by this check - confirm this is acceptable for the PDFs in question.
        if (renderInfo.GetText().Contains(_caption))
            _captionFound = true;
    }

    public void RenderImage(ImageRenderInfo renderInfo)
    {
        // Skip the image if _captionFound is false
        if (!_captionFound)
            return;

        // _captionFound is true, so extract the image

        // Code to extract image

        // Set _captionFound back to false, so that only the first image found is
        // extracted.
        _captionFound = false;

    }

    /// <summary>
    /// Processes every page of the PDF and extracts the image following the given caption.
    /// </summary>
    /// <param name="caption">Text that precedes the wanted image.</param>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <param name="outputFilePrefix">File name prefix passed on to the constructor.</param>
    /// <param name="outputFolder">Folder the image files are written to.</param>
    /// <param name="overwriteExistingFiles">Whether an already existing image file is overwritten.</param>
    /// <param name="fileTypes">Accepted image file types, or null.</param>
    /// <returns>The value of the instance's image counter after processing.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="caption"/> is null.</exception>
    /// <exception cref="ApplicationException">The PDF file is encrypted.</exception>
    public static int ExtractImageByCaption(string caption, string pdfPath, string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, string[] fileTypes = null)
    {
        // Fail early: a null caption would otherwise only fail on the first text chunk.
        if (caption == null)
            throw new ArgumentNullException("caption");

        var instance = new ImageExtractor(outputFilePrefix, outputFolder, overwriteExistingFiles, fileTypes, 0);

        instance._caption = caption;
        instance._outputFolder = outputFolder;

        using (var pdfReader = new PdfReader(pdfPath))
        {
            if (pdfReader.IsEncrypted())
                throw new ApplicationException(pdfPath + " is encrypted.");

            var pdfParser = new PdfReaderContentParser(pdfReader);

            // _captionFound is not reset between pages, so a caption seen on one page
            // still applies to the first image of a following page.
            while (instance._currentPage <= pdfReader.NumberOfPages)
            {
                pdfParser.ProcessContent(instance._currentPage, instance);

                instance._currentPage++;
            }
        }

        // Assumes the omitted extraction code increments _imageCount for each extracted
        // image, as the index-based extractor does - TODO confirm.
        return instance._imageCount;
    }
}

Получить текст предыдущего изображения в PDF

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Получить текст предыдущего изображения в PDF

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Нет похожих вопросов