Tika Parser не может разбирать греческие символы - PullRequest
1 голос
/ 07 августа 2020

Я пытаюсь разобрать с помощью Apache Tika файл .doc, который содержит греческие символы, такие как альфа, бета, гамма, но результат Tika полностью отличается от того, что я ожидал. Для синтаксического анализа файла .doc я использую приведённый ниже код:

// Excerpt of the extraction code; the complete program is listed further below under "EDIT".
// NOTE(review): FileInputStream has no no-argument constructor; the file argument
// appears to have been left out of this excerpt (the full listing passes a File).
FileInputStream fileInputStream = new FileInputStream();
// AutoDetectParser is used so the document type does not have to be chosen by hand.
Parser parser = new AutoDetectParser();
// Collects the extracted body text; -1 presumably disables the handler's write limit
// (confirm against the BodyContentHandler API documentation).
BodyContentHandler handler = new BodyContentHandler(-1);
ParseContext parseContext = new ParseContext();
// Register the same parser in the context under the Parser class key.
parseContext.set(Parser.class, parser);
Metadata metadatafromtika = new Metadata();
// Adds a UTF-8 content-encoding entry to the metadata handed to the parser.
metadatafromtika.add(Metadata.CONTENT_ENCODING,"UTF-8");
parser.parse(fileInputStream, handler, metadatafromtika, parseContext);
            // The text accumulated by the handler during parsing.
            String text = handler.toString();

Я использую кодировку UTF-8 в строке

metadatafromtika.add(Metadata.CONTENT_ENCODING,"UTF-8");

, а ниже приведены зависимости, которые я использую

<dependencies>
<!-- Apache Tika parsers: the library used by the extraction code in this post. -->
<dependency>
  <groupId>org.apache.tika</groupId>
  <artifactId>tika-parsers</artifactId>
  <version>1.18</version>
</dependency>

<dependency>
  <groupId>commons-collections</groupId>
  <artifactId>commons-collections</artifactId>
  <version>3.2.1</version>
</dependency>

<dependency>
  <groupId>org.apache.logging.log4j</groupId>
  <artifactId>log4j-core</artifactId>
  <version>2.9.1</version>
</dependency>

<dependency>
  <groupId>org.antlr</groupId>
  <artifactId>ST4</artifactId>
  <version>4.0.8</version>
</dependency>

<dependency>
  <groupId>org.postgresql</groupId>
  <artifactId>postgresql</artifactId>
  <version>42.1.4</version>
</dependency>

<dependency>
  <groupId>com.zaxxer</groupId>
  <artifactId>HikariCP</artifactId>
  <version>2.7.2</version>
</dependency>

<dependency>
  <groupId>commons-dbutils</groupId>
  <artifactId>commons-dbutils</artifactId>
  <version>1.6</version>
</dependency>

<dependency>
  <groupId>commons-io</groupId>
  <artifactId>commons-io</artifactId>
  <version>2.5</version>
</dependency>

<dependency>
  <groupId>org.json</groupId>
  <artifactId>json</artifactId>
  <version>20171018</version>
</dependency>

<dependency>
  <groupId>org.apache.hive</groupId>
  <artifactId>hive-jdbc</artifactId>
  <version>1.1.0-cdh5.10.1</version>
</dependency>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-common</artifactId>
  <version>2.6.0-cdh5.10.1</version>
</dependency>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-hdfs</artifactId>
  <version>2.6.0-cdh5.10.1</version>
</dependency>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-mapreduce-client-core</artifactId>
  <version>2.6.0-cdh5.10.1</version>
</dependency>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-tools</artifactId>
  <version>2.6.0-mr1-cdh5.10.1</version>
</dependency>

<dependency>
  <groupId>org.apache.htrace</groupId>
  <artifactId>htrace-core4</artifactId>
  <version>4.0.1-incubating</version>
</dependency>

<dependency>
  <groupId>com.google.code.gson</groupId>
  <artifactId>gson</artifactId>
  <version>2.8.1</version>
</dependency>

<dependency>
  <groupId>com.levigo.jbig2</groupId>
  <artifactId>levigo-jbig2-imageio</artifactId>
  <version>1.6.5</version>
</dependency>

<dependency>
  <groupId>com.github.jai-imageio</groupId>
  <artifactId>jai-imageio-core</artifactId>
  <version>1.3.1</version>
</dependency>

<dependency>
  <groupId>com.fasterxml.jackson.core</groupId>
  <artifactId>jackson-core</artifactId>
  <version>2.9.5</version>
</dependency>
</dependencies>

содержание в текстовом документе

enter image description here

The output which I get when I use the above Tika code is:

enter image description here

Is UTF-8 encoding not suitable for parsing Greek characters with Apache Tika, or am I missing something in the code?

Thanks in advance

EDIT: here is the complete Java code which I am using

      import org.apache.commons.io.FileUtils;
      import org.apache.tika.metadata.Metadata;
      import org.apache.tika.parser.AutoDetectParser;
      import org.apache.tika.parser.ParseContext;
      import org.apache.tika.parser.Parser;
      import org.apache.tika.sax.BodyContentHandler;
      import java.io.File;
      import java.io.FileInputStream;
      import java.nio.charset.StandardCharsets;


    /**
     * Command-line tool that extracts the text of a document with Apache Tika and
     * stores it as a UTF-8 encoded text file.
     *
     * <p>Arguments: {@code args[0]} is the path of the input document and
     * {@code args[1]} is the output directory. The extracted text is written to
     * {@code <outputDirectory>/<inputFileName>_content.txt}.
     */
    public class Tika {

    /**
     * Parses the input document, prints the extracted text and writes it to the
     * output directory.
     *
     * <p>Any failure (missing arguments, I/O or parse errors) is caught and its
     * stack trace is printed; nothing is rethrown.
     *
     * @param args {@code args[0]} input file path, {@code args[1]} output directory
     */
    public static void main(String[] args) {
        try {
            String inputPath = args[0];
            String outputPath = args[1];
            File f = new File(inputPath);
            System.out.println("path is : " + f.getAbsoluteFile());

            String text;
            // try-with-resources closes the input stream even when parsing fails.
            try (FileInputStream fileInputStream = new FileInputStream(f)) {
                Parser parser = new AutoDetectParser();
                BodyContentHandler handler = new BodyContentHandler(-1);
                ParseContext parseContext = new ParseContext();
                parseContext.set(Parser.class, parser);
                Metadata metadatafromtika = new Metadata();
                metadatafromtika.add(Metadata.CONTENT_ENCODING, "UTF-8");
                parser.parse(fileInputStream, handler, metadatafromtika, parseContext);
                text = handler.toString();
            }

            System.out.println("done parsing for file : " + f.getAbsolutePath());
            System.out.println("text is : \n" + text);

            // Encode and decode with the SAME charset. The previous getBytes() call
            // used the platform default charset, so decoding the bytes as UTF-8
            // corrupted non-ASCII characters on platforms whose default is not UTF-8.
            byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
            String encodedText = new String(bytes, StandardCharsets.UTF_8);
            // NOTE(review): System.out still uses the console's own encoding, so the
            // printed text may look wrong even though the file written below is correct.
            System.out.println("encoded text is : " + encodedText);

            FileUtils.writeStringToFile(new File(outputPath + File.separator + f.getName() + "_content.txt"),
                text, "UTF-8");
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
    }

EDIT 2 : Below is the code which uses PrintWriter

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;


/**
 * Command-line tool that extracts the text of a document with Apache Tika and
 * writes it to a UTF-8 encoded text file through a {@link PrintWriter}.
 *
 * <p>Arguments: {@code args[0]} is the path of the input document and
 * {@code args[1]} is the output directory. The extracted text is written to
 * {@code <outputDirectory>/<inputFileName>_content.txt}.
 */
public class TikaTmp {

    /**
     * Parses the input document and writes the extracted text to the output directory.
     *
     * <p>Any failure (missing arguments, I/O or parse errors) is caught and its
     * stack trace is printed; nothing is rethrown.
     *
     * @param args {@code args[0]} input file path, {@code args[1]} output directory
     */
    public static void main(String[] args) {
        try {
            String inputPath = args[0];
            String outputPath = args[1];
            File f = new File(inputPath);
            System.out.println("path is : " + f.getAbsoluteFile());

            // try-with-resources closes the input stream on every exit path.
            try (FileInputStream fileInputStream = new FileInputStream(f)) {
                Parser parser = new AutoDetectParser();
                BodyContentHandler handler = new BodyContentHandler(-1);
                ParseContext parseContext = new ParseContext();
                parseContext.set(Parser.class, parser);
                Metadata metadatafromtika = new Metadata();
                metadatafromtika.add(Metadata.CONTENT_ENCODING, "UTF-8");
                parser.parse(fileInputStream, handler, metadatafromtika, parseContext);

                // The writer is opened only after parsing succeeded, so a failed parse
                // does not leave an empty output file behind. The original referenced an
                // undeclared variable "output_path" here; the declared name is outputPath.
                File target = new File(outputPath + File.separator + f.getName() + "_content.txt");
                try (PrintWriter printWriter = new PrintWriter(target, "UTF-8")) {
                    printWriter.write(String.valueOf(handler));
                    printWriter.flush();
                }
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

}

EDIT 3: The characters which I am trying to parse are from the Symbol font which Microsoft Word uses. Tika fails only for the characters from the Symbol font.

введите описание изображения здесь

Я предполагаю, что это не настоящие греческие символы, а выглядят как греческие символы

...