Я пытаюсь разобрать с помощью Apache Tika файл .doc, содержащий греческие символы, такие как alpha, beta, gamma, но результат Tika полностью отличается от того, что я ожидал. Для синтаксического анализа файла .doc я использую приведённый ниже код:
// Extracts the plain-text body of a document with Apache Tika.
// NOTE(review): FileInputStream has no no-argument constructor; the path/File
// argument appears to have been elided from this snippet. The stream is also
// never closed here.
FileInputStream fileInputStream = new FileInputStream();
// AutoDetectParser picks the concrete parser from the detected document type.
Parser parser = new AutoDetectParser();
// -1 disables BodyContentHandler's write limit, so the whole body is collected.
BodyContentHandler handler = new BodyContentHandler(-1);
ParseContext parseContext = new ParseContext();
// Register the same parser in the context (presumably so embedded documents
// are parsed recursively; confirm against the Tika documentation).
parseContext.set(Parser.class, parser);
Metadata metadatafromtika = new Metadata();
// NOTE(review): this only supplies an encoding hint in the metadata; it
// presumably has no effect on a binary .doc file. TODO confirm.
metadatafromtika.add(Metadata.CONTENT_ENCODING,"UTF-8");
parser.parse(fileInputStream, handler, metadatafromtika, parseContext);
// The handler accumulates the extracted text; toString() returns it.
String text = handler.toString();
Я использую кодировку UTF-8 в строке
metadatafromtika.add(Metadata.CONTENT_ENCODING,"UTF-8");
, а ниже приведены зависимости, которые я использую
<!-- Maven dependencies used by the Tika text-extraction program. -->
<dependencies>
    <!-- Apache Tika parsers (document text extraction). -->
    <dependency>
        <groupId>org.apache.tika</groupId>
        <artifactId>tika-parsers</artifactId>
        <version>1.18</version>
    </dependency>
    <!-- NOTE(review): commons-collections 3.2.1 predates the 3.2.2 deserialization
         hardening release; consider upgrading. Verify before changing. -->
    <dependency>
        <groupId>commons-collections</groupId>
        <artifactId>commons-collections</artifactId>
        <version>3.2.1</version>
    </dependency>
    <!-- NOTE(review): log4j-core 2.9.1 appears to be within the range affected by
         CVE-2021-44228 (Log4Shell); consider upgrading. Verify before changing. -->
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.9.1</version>
    </dependency>
    <dependency>
        <groupId>org.antlr</groupId>
        <artifactId>ST4</artifactId>
        <version>4.0.8</version>
    </dependency>
    <dependency>
        <groupId>org.postgresql</groupId>
        <artifactId>postgresql</artifactId>
        <version>42.1.4</version>
    </dependency>
    <dependency>
        <groupId>com.zaxxer</groupId>
        <artifactId>HikariCP</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>commons-dbutils</groupId>
        <artifactId>commons-dbutils</artifactId>
        <version>1.6</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.5</version>
    </dependency>
    <dependency>
        <groupId>org.json</groupId>
        <artifactId>json</artifactId>
        <version>20171018</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-jdbc</artifactId>
        <version>1.1.0-cdh5.10.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.6.0-cdh5.10.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.6.0-cdh5.10.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.6.0-cdh5.10.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-tools</artifactId>
        <version>2.6.0-mr1-cdh5.10.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.htrace</groupId>
        <artifactId>htrace-core4</artifactId>
        <version>4.0.1-incubating</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.1</version>
    </dependency>
    <dependency>
        <groupId>com.levigo.jbig2</groupId>
        <artifactId>levigo-jbig2-imageio</artifactId>
        <version>1.6.5</version>
    </dependency>
    <dependency>
        <groupId>com.github.jai-imageio</groupId>
        <artifactId>jai-imageio-core</artifactId>
        <version>1.3.1</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-core</artifactId>
        <version>2.9.5</version>
    </dependency>
</dependencies>
содержание в текстовом документе
the output which I get when I use the above tika code is
Is UTF-8 encoding not suitable for parsing Greek characters using Apache Tika, or am I missing something in the code?
Thanks in advance
EDIT: here is the complete Java code which I am using
import org.apache.commons.io.FileUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import java.io.File;
import java.io.FileInputStream;
import java.nio.charset.StandardCharsets;
/**
 * Command-line utility that extracts the text of a document with Apache Tika,
 * prints it to standard output and stores it, encoded as UTF-8, in
 * {@code <outputPath>/<inputFileName>_content.txt}.
 *
 * <p>Usage: {@code java Tika <inputPath> <outputPath>}
 */
public class Tika {
    /**
     * Parses the file named by {@code args[0]} and writes the extracted text
     * into the directory named by {@code args[1]}.
     *
     * @param args {@code args[0]} is the input file path, {@code args[1]} the
     *             output directory
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("usage: Tika <inputPath> <outputPath>");
            return;
        }
        String inputPath = args[0];
        String outputPath = args[1];
        File f = new File(inputPath);
        System.out.println("path is : " + f.getAbsoluteFile());

        // try-with-resources guarantees the input stream is closed even when
        // parsing fails; previously the stream was never closed.
        try (FileInputStream fileInputStream = new FileInputStream(f)) {
            Parser parser = new AutoDetectParser();
            // -1 disables the handler's write limit so the whole body is kept.
            BodyContentHandler handler = new BodyContentHandler(-1);
            ParseContext parseContext = new ParseContext();
            parseContext.set(Parser.class, parser);
            Metadata metadatafromtika = new Metadata();
            // NOTE(review): this is only an encoding hint; it presumably has no
            // effect on binary formats such as .doc. TODO confirm.
            metadatafromtika.add(Metadata.CONTENT_ENCODING, "UTF-8");
            parser.parse(fileInputStream, handler, metadatafromtika, parseContext);

            String text = handler.toString();
            System.out.println("done parsing for file : " + f.getAbsolutePath());
            System.out.println("text is : \n" + text);

            // Encode and decode with the SAME charset. The previous code encoded
            // with the platform default charset and decoded as UTF-8, which
            // corrupts non-ASCII characters on non-UTF-8 platforms.
            byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
            String encodedText = new String(bytes, StandardCharsets.UTF_8);
            System.out.println("encoded text is : " + encodedText);

            File target = new File(outputPath + File.separator + f.getName() + "_content.txt");
            FileUtils.writeStringToFile(target, text, "UTF-8");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
EDIT 2 : Below is the code which uses PrintWriter
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
/**
 * Command-line utility that extracts the text of a document with Apache Tika
 * and writes it, encoded as UTF-8, to
 * {@code <outputPath>/<inputFileName>_content.txt} using a {@link PrintWriter}.
 *
 * <p>Usage: {@code java TikaTmp <inputPath> <outputPath>}
 */
public class TikaTmp {
    /**
     * Parses the file named by {@code args[0]} and writes the extracted text
     * into the directory named by {@code args[1]}.
     *
     * @param args {@code args[0]} is the input file path, {@code args[1]} the
     *             output directory
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("usage: TikaTmp <inputPath> <outputPath>");
            return;
        }
        String inputPath = args[0];
        String outputPath = args[1];
        File f = new File(inputPath);
        System.out.println("path is : " + f.getAbsoluteFile());

        // try-with-resources closes the input stream on every path, replacing
        // the manual null-check/close in a finally block.
        try (FileInputStream fileInputStream = new FileInputStream(f)) {
            Parser parser = new AutoDetectParser();
            // -1 disables the handler's write limit so the whole body is kept.
            BodyContentHandler handler = new BodyContentHandler(-1);
            ParseContext parseContext = new ParseContext();
            parseContext.set(Parser.class, parser);
            Metadata metadatafromtika = new Metadata();
            // NOTE(review): this is only an encoding hint; it presumably has no
            // effect on binary formats such as .doc. TODO confirm.
            metadatafromtika.add(Metadata.CONTENT_ENCODING, "UTF-8");
            parser.parse(fileInputStream, handler, metadatafromtika, parseContext);

            // The original referenced the undefined name "output_path"; the
            // variable declared above is "outputPath".
            File target = new File(outputPath + File.separator + f.getName() + "_content.txt");
            // The writer is opened only after parsing succeeded, and is closed
            // (and therefore flushed) even if write() throws.
            try (PrintWriter printWriter = new PrintWriter(target, "UTF-8")) {
                printWriter.write(String.valueOf(handler));
                printWriter.flush();
                // PrintWriter swallows I/O errors; surface them explicitly.
                if (printWriter.checkError()) {
                    System.err.println("failed to write : " + target.getAbsolutePath());
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
EDIT 3: The characters which I am trying to parse are from the Symbol font which Microsoft Word uses. Tika fails only for the characters from the Symbol font.
введите описание изображения здесь
Я предполагаю, что это не настоящие греческие символы, а символы, которые лишь выглядят как греческие.