// Extract the text of the PDF referenced by `file`, echo it to stdout, and
// save it to disk as UTF-8.
String text;
// try-with-resources guarantees the document is closed even if text
// extraction throws (e.g. on a malformed font or CMap).
try (PDDocument document = PDDocument.load(file))
{
    if (document.isEncrypted())
    {
        // NOTE(review): passing false looks like it leaves the flag at its
        // default; confirm whether `true` was intended here.
        document.setAllSecurityToBeRemoved(false);
    }
    PDFTextStripper stripper = new PDFTextStripper();
    //stripper.setSortByPosition( true );
    text = stripper.getText(document);
}
System.out.println(text);
// The writer is closed (and therefore flushed) automatically, including
// when write() fails part-way through.
try (OutputStreamWriter writer =
        new OutputStreamWriter(new FileOutputStream("C:\\preface.txt"), StandardCharsets.UTF_8))
{
    writer.write(text);
}
Я пытаюсь извлечь текст из PDF-файла, в котором используются шрифты DejaVu Sans Condensed и DejaVu Sans Condensed-Bold, но получаю ошибку, указанную ниже:
SEVERE: Could not read ToUnicode CMap in font DejaVuSansCondensed
java.io.IOException: Error: expected the end of a dictionary.
at org.apache.fontbox.cmap.CMapParser.parseNextToken(CMapParser.java:477)
at org.apache.fontbox.cmap.CMapParser.parse(CMapParser.java:112)
at org.apache.pdfbox.pdmodel.font.CMapManager.parseCMap(CMapManager.java:75)
at org.apache.pdfbox.pdmodel.font.PDFont.readCMap(PDFont.java:197)
at org.apache.pdfbox.pdmodel.font.PDFont.<init>(PDFont.java:137)
at org.apache.pdfbox.pdmodel.font.PDType0Font.<init>(PDType0Font.java:176)
at org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:83)
at org.apache.pdfbox.pdmodel.PDResources.getFont(PDResources.java:146)
at org.apache.pdfbox.contentstream.operator.text.SetFontAndSize.process(SetFontAndSize.java:60)
at org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:848)
at org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:503)
at org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:477)
at org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:150)
at org.apache.pdfbox.text.LegacyPDFStreamEngine.processPage(LegacyPDFStreamEngine.java:139)
at org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:391)
at org.apache.pdfbox.text.PDFTextStripper.processPages(PDFTextStripper.java:319)
at org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:266)
at org.apache.pdfbox.text.PDFTextStripper.getText(PDFTextStripper.java:227)
at Library.main(Library.java:32)
Jun 03, 2018 1:30:59 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font DejaVuSansCondensed are not implemented in PDFBox and will be ignored
Jun 03, 2018 1:30:59 AM org.apache.pdfbox.pdmodel.font.PDType0Font toUnicode
WARNING: No Unicode mapping for CID+98 (98) in font DejaVuSansCondensed
Jun 03, 2018 1:30:59 AM org.apache.pdfbox.pdmodel.font.PDType0Font toUnicode
WARNING: No Unicode mapping for CID+105 (105) in font DejaVuSansCondensed
Я также обнаружил, что для этого конкретного набора PDF-файлов нет отображения Unicode. Пожалуйста, помогите написать отображение Unicode для этой программы.
P.S. Я новичок в PDFBox.