可能是因为PDFParser的版本与pdfbox依赖的版本不兼容造成的问题。可以尝试降低pdfbox的版本,或者升级Tika的版本来解决此问题。
代码示例:
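Maven依赖:
<dependency>
  <groupId>org.apache.tika</groupId>
  <artifactId>tika-parsers</artifactId>
  <version>1.24.1</version>
</dependency>
<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox</artifactId>
  <version>2.0.24</version>
</dependency>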
Java代码:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Minimal example that extracts the text of a PDF file with Apache Tika's
 * {@link PDFParser} and prints it to standard output.
 */
public class TikaPDFParserExample {

    /**
     * Parses {@code example.pdf} (resolved against the current working directory)
     * and prints the extracted body text.
     *
     * @param args command-line arguments; not used
     * @throws IOException   if the file cannot be opened or read
     * @throws SAXException  if the content handler fails while receiving parse events
     * @throws TikaException if Tika cannot parse the document
     */
    public static void main(String[] args) throws IOException, SAXException, TikaException {
        Parser parser = new PDFParser();
        File file = new File("example.pdf");
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        // Keep a reference to the handler that is handed to the parser: the extracted
        // text accumulates inside this instance. Printing a freshly constructed handler
        // (as the earlier version did) always yields an empty string.
        // NOTE(review): the no-arg BodyContentHandler is documented to cap output at a
        // default write limit; pass -1 to the constructor if large PDFs must be
        // extracted in full -- confirm against the Tika version in use.
        BodyContentHandler handler = new BodyContentHandler();

        // try-with-resources guarantees the stream is closed even if parsing throws.
        try (FileInputStream inputStream = new FileInputStream(file)) {
            parser.parse(inputStream, handler, metadata, context);
        }

        System.out.println("Extracted Content: " + handler.toString());
    }
}