diff --git a/pom.xml b/pom.xml
index 5210f91..db64daa 100644
--- a/pom.xml
+++ b/pom.xml
@@ -55,7 +55,7 @@
org.apache.pdfbox
pdfbox
- 2.0.23
+ 2.0.24
bundle
@@ -100,6 +100,32 @@
1.4.0
+
+ org.apache.poi
+ poi
+ 5.2.2
+
+
+ org.apache.poi
+ poi-ooxml
+ 5.2.2
+
+
+
+
+ org.apache.poi
+ poi-scratchpad
+ 5.2.2
+
+
+
+
+ net.coobird
+ thumbnailator
+ 0.4.20
+
+
+
diff --git a/src/main/java/com/docus/sw/fenpan/FenpanService.java b/src/main/java/com/docus/sw/fenpan/FenpanService.java
index e256993..bdaced5 100644
--- a/src/main/java/com/docus/sw/fenpan/FenpanService.java
+++ b/src/main/java/com/docus/sw/fenpan/FenpanService.java
@@ -2,6 +2,7 @@ package com.docus.sw.fenpan;
import com.alibaba.excel.util.FileUtils;
import com.docus.sw.Config;
+import com.docus.sw.word.PdfBoxUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.imaging.ImageInfo;
import org.apache.commons.imaging.ImageReadException;
@@ -86,6 +87,10 @@ public class FenpanService {
new FenpanService().fenpan(saveUrl, readUrl);
}
+ public List getDocument(Pieces piece){ // Placeholder: the piece argument is not read yet.
+ return null; // NOTE(review): always null for now, so callers must null-check until this is implemented.
+ }
+
public Map readFile(String readUrl) {
//读取文件夹。
@@ -99,54 +104,37 @@ public class FenpanService {
//根据文件类型
if (piece.getFileTypeEnum() == FileTypeEnum.WORD) {
//从word 直接提取图片
+ //提取图片为document ,然后用于后面判断
} else if (piece.getFileTypeEnum() == FileTypeEnum.PDF) {
- // 从pdf 提取图片
+ // 从pdf 提取图片,
+ //提取图片为document ,然后用于后面判断
+ File file = new File("temp");
+ if(!file.exists()){
+ file.mkdirs();
+ }
+
+ List documentList = new ArrayList<>();
+ try {
+ PdfBoxUtils.pdf2image(piece.getAbsolutePath(),file.getAbsolutePath());
+ File[] files = file.listFiles();
+ for(File pdfImg:files){
+ getDocumentList(documentList, pdfImg);
+ }
+ } catch (IOException e) {
+ log.error(e.getMessage(),e);
+ }
+ piece.put(documentList);
+ //删除对应的temp 文件
+ file.delete();
} else {
//是图片,直接从图片提取
List documentList = new ArrayList<>();
File sourceFile = new File(piece.getAbsolutePath());
File[] files = sourceFile.listFiles();
for (File file : files) {
- //非图片模式,跳过。
- if (!(file.getName().endsWith(".jpg") || file.getName().endsWith(".png")
- || file.getName().endsWith(".jpeg") || file.getName().endsWith(".tif")
- || file.getName().endsWith(".tiff")) || file.getName().endsWith(".jp2") || file.getName().endsWith(".jpm")|| file.getName().endsWith(".gif")) {
- continue;
- }
-
- if (file.getName().endsWith(".jp2") || file.getName().endsWith(".jpm")) {
- // 读取 JPEG 2000 图像文件
-
- try {
- BufferedImage image = ImageIO.read(file);
- int height = image.getHeight();
- int width = image.getWidth();
- Document document = new Document(width, height, 300);
- documentList.add(document);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- } else {
- try {
- ImageInfo imageInfo = Imaging.getImageInfo(file);
- int height = imageInfo.getHeight();
- int width = imageInfo.getWidth();
- int physicalHeightDpi = imageInfo.getPhysicalHeightDpi();
- Document document = new Document(width, height, physicalHeightDpi);
- documentList.add(document);
- } catch (IOException e) {
- FileUtils.delete(file);
- throw new RuntimeException("非图片格式", e);
- } catch (ImageReadException e) {
- FileUtils.delete(file);
- throw new RuntimeException(e);
- } catch (IllegalArgumentException e) {
- FileUtils.delete(file);
- }
- }
+ getDocumentList(documentList, file);
}
piece.put(documentList);
}
@@ -204,6 +192,47 @@ public class FenpanService {
}
+ /** Appends a Document (width, height, DPI) for {@code file} to {@code documentList} when it has a supported image extension; other files are ignored. */
+ private static void getDocumentList(List documentList, File file) {
+     String name = file.getName();
+     // Fix: the closing parenthesis used to follow ".tiff", so .jp2/.jpm/.gif files were skipped and the JPEG 2000 branch below was unreachable.
+     if (!(name.endsWith(".jpg") || name.endsWith(".png") || name.endsWith(".jpeg") || name.endsWith(".tif")
+             || name.endsWith(".tiff") || name.endsWith(".jp2") || name.endsWith(".jpm") || name.endsWith(".gif"))) {
+         return;
+     }
+
+     if (name.endsWith(".jp2") || name.endsWith(".jpm")) {
+         // JPEG 2000 files are decoded through ImageIO; their resolution is hard-coded to 300.
+         try {
+             BufferedImage image = ImageIO.read(file);
+             if (image == null) {
+                 // ImageIO.read returns null when no registered reader can decode the file.
+                 log.warn("No ImageIO reader available for {}, skipping", file);
+                 return;
+             }
+             documentList.add(new Document(image.getWidth(), image.getHeight(), 300));
+         } catch (IOException e) {
+             throw new RuntimeException(e);
+         }
+     } else {
+         try {
+             ImageInfo imageInfo = Imaging.getImageInfo(file);
+             // The vertical DPI reported by the image is passed as the document resolution.
+             documentList.add(new Document(imageInfo.getWidth(), imageInfo.getHeight(), imageInfo.getPhysicalHeightDpi()));
+         } catch (IOException e) {
+             // Unreadable files are deleted from disk before the failure is reported.
+             FileUtils.delete(file);
+             throw new RuntimeException("非图片格式", e);
+         } catch (ImageReadException e) {
+             FileUtils.delete(file);
+             throw new RuntimeException(e);
+         } catch (IllegalArgumentException e) {
+             // Best effort: the offending file is deleted and no Document is added for it.
+             FileUtils.delete(file);
+         }
+     }
+ }
+
private void findAllDir(String absolutePath, List allDirectory) {
File sourceFile = new File(absolutePath);
diff --git a/src/main/java/com/docus/sw/souyin/SuoyinService.java b/src/main/java/com/docus/sw/souyin/SuoyinService.java
index 539b418..240ac7b 100644
--- a/src/main/java/com/docus/sw/souyin/SuoyinService.java
+++ b/src/main/java/com/docus/sw/souyin/SuoyinService.java
@@ -70,6 +70,9 @@ public class SuoyinService {
List pieceList = new ArrayList<>();
File[] piecesFile = rollFile.listFiles();
for(File piece:piecesFile){
+
+ //区分word和pdf
+
List documentList = new ArrayList<>();
Pieces pieces = new Pieces(FileTypeEnum.JPG, piece.getAbsolutePath(), piece.getName());
for (File docfile : piece.listFiles()) {
diff --git a/src/main/java/com/docus/sw/word/GetPicsDocx.java b/src/main/java/com/docus/sw/word/GetPicsDocx.java
new file mode 100644
index 0000000..10e051b
--- /dev/null
+++ b/src/main/java/com/docus/sw/word/GetPicsDocx.java
@@ -0,0 +1,35 @@
+package com.docus.sw.word;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFPictureData;
+
+ public class GetPicsDocx {
+     /** Demo entry point: prints the text of a hard-coded .docx file and copies each embedded picture into a hard-coded folder. */
+     public static void main(String[] args) {
+         String path = "E:\\上海项目测试\\文档\\35.docx";
+         File file = new File(path);
+         // try-with-resources closes the document and the input stream even when reading fails part-way.
+         try (FileInputStream fis = new FileInputStream(file);
+              XWPFDocument document = new XWPFDocument(fis)) {
+             XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(document);
+             String text = xwpfWordExtractor.getText();
+             System.out.println(text);
+             for (XWPFPictureData pic : document.getAllPictures()) {
+                 System.out.println(pic.getPictureType() + File.separator + pic.suggestFileExtension() + File.separator + pic.getFileName());
+                 // One stream per picture, closed right after the write; it used to be left open, leaking a file handle per image.
+                 try (FileOutputStream fos = new FileOutputStream("E:\\上海项目测试\\docxImage\\" + pic.getFileName())) {
+                     fos.write(pic.getData());
+                 }
+             }
+         } catch (IOException e) {
+             e.printStackTrace();
+         }
+     }
+ }
\ No newline at end of file
diff --git a/src/main/java/com/docus/sw/word/PdfBoxUtils.java b/src/main/java/com/docus/sw/word/PdfBoxUtils.java
new file mode 100644
index 0000000..6df3485
--- /dev/null
+++ b/src/main/java/com/docus/sw/word/PdfBoxUtils.java
@@ -0,0 +1,192 @@
+package com.docus.sw.word;
+
+import net.coobird.thumbnailator.Thumbnails;
+import net.coobird.thumbnailator.filters.Canvas;
+import net.coobird.thumbnailator.geometry.Positions;
+import org.apache.pdfbox.contentstream.PDFStreamEngine;
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.io.MemoryUsageSetting;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
+import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
+import org.apache.pdfbox.pdmodel.encryption.PDEncryption;
+import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
+import org.apache.pdfbox.pdmodel.encryption.StandardProtectionPolicy;
+import org.apache.pdfbox.pdmodel.encryption.StandardSecurityHandler;
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+
+import javax.imageio.ImageIO;
+import java.awt.*;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
+ /**
+  * PDFBox helpers: password-protect and unprotect PDF files and extract the images embedded in their pages.
+  */
+ public class PdfBoxUtils {
+
+     // NOTE(review): hard-coded key, referenced only by the commented-out sample in main(); consider moving it out of the source.
+     private static final String vectorKey = "AesVKeyDocus@702";
+
+     /**
+      * Encrypts a PDF with a password and revokes every access permission.
+      *
+      * @param key            password used as both the owner and the user password
+      * @param sourceFilePath path of the PDF to read
+      * @param destFilePath   path the encrypted PDF is saved to
+      * @throws Exception if loading, encrypting or saving the document fails
+      */
+     public static void encrypt(String key, String sourceFilePath, String destFilePath) throws Exception {
+         // try-with-resources closes the document even when encryption or saving throws.
+         try (PDDocument load = PDDocument.load(new File(sourceFilePath))) {
+             AccessPermission permissions = new AccessPermission();
+             // Forbid inserting, deleting and rotating pages.
+             permissions.setCanAssembleDocument(false);
+             // Forbid copying and extracting content.
+             permissions.setCanExtractContent(false);
+             permissions.setCanExtractForAccessibility(false);
+             // Forbid filling in interactive form fields (including signature fields).
+             permissions.setCanFillInForm(false);
+             // Forbid modifying the document.
+             permissions.setCanModify(false);
+             // Forbid adding or modifying text annotations.
+             permissions.setCanModifyAnnotations(false);
+             // Forbid printing, including degraded-quality printing.
+             permissions.setCanPrint(false);
+             permissions.setCanPrintDegraded(false);
+
+             StandardProtectionPolicy p = new StandardProtectionPolicy(key, key, permissions);
+             SecurityHandler sh = new StandardSecurityHandler(p);
+             sh.prepareDocumentForEncryption(load);
+             PDEncryption encryptionOptions = new PDEncryption();
+             encryptionOptions.setSecurityHandler(sh);
+             load.setEncryptionDictionary(encryptionOptions);
+             load.save(destFilePath);
+         }
+     }
+
+     /**
+      * Removes the password protection from a PDF.
+      *
+      * @param key            password that opens the source PDF
+      * @param sourceFilePath path of the encrypted PDF
+      * @param destFilePath   path the decrypted PDF is saved to
+      * @throws InvalidPasswordException declared for the case where the password is rejected while loading
+      * @throws IOException              if the source cannot be read or the result cannot be written
+      */
+     public static void decrypt(String key, String sourceFilePath, String destFilePath) throws InvalidPasswordException, IOException {
+         try (PDDocument load = PDDocument.load(new File(sourceFilePath), key)) {
+             load.setAllSecurityToBeRemoved(true);
+             load.save(destFilePath);
+         }
+     }
+
+     /**
+      * Stream engine that saves every image XObject drawn by a page as
+      * {@code <name>-<imageNum>.jpg} inside {@code des}, descending into form XObjects.
+      */
+     public static class DefPdfToImageEngine extends PDFStreamEngine {
+         /** Sequence number used in the next output file name; starts at 1. */
+         public int imageNum = 1;
+
+         private final String des;
+         private final String name;
+
+         /**
+          * @param name base name of the generated image files
+          * @param des  directory the images are written to
+          */
+         public DefPdfToImageEngine(String name, String des) {
+             this.des = des;
+             this.name = name;
+         }
+
+         /** Intercepts the "Do" operator to export images; every other operator is delegated to the superclass. */
+         @Override
+         protected void processOperator(Operator operator, List operands) throws IOException {
+             if (!"Do".equals(operator.getName())) {
+                 super.processOperator(operator, operands);
+                 return;
+             }
+             COSName objName = (COSName) operands.get(0);
+             PDXObject pdxObject = this.getResources().getXObject(objName);
+             if (pdxObject instanceof PDImageXObject) {
+                 BufferedImage image = ((PDImageXObject) pdxObject).getImage();
+                 // Fix: resolve the file name against the directory. Plain concatenation dropped the separator
+                 // when des had no trailing slash, so images were written next to the directory instead of inside it.
+                 File target = java.nio.file.Paths.get(this.des, this.name + "-" + this.imageNum + ".jpg").toFile();
+                 // NOTE(review): ImageIO.write returns false when no JPEG writer accepts the image; the result is not checked here.
+                 ImageIO.write(image, "jpg", target);
+                 ++this.imageNum;
+             } else if (pdxObject instanceof PDFormXObject) {
+                 this.showForm((PDFormXObject) pdxObject);
+             }
+         }
+     }
+
+     /**
+      * Extracts the images embedded in every page of a PDF.
+      *
+      * @param src path of the PDF
+      * @param des directory the images are written to
+      * @throws IOException if the PDF cannot be read or an image cannot be written
+      */
+     public static void pdf2image(String src, String des) throws IOException {
+         File pdfFile = new File(src);
+         try (PDDocument load = PDDocument.load(pdfFile, MemoryUsageSetting.setupMixed(1024 * 1024))) {
+             DefPdfToImageEngine imageEngine = new DefPdfToImageEngine(baseName(pdfFile), des);
+             for (PDPage page : load.getPages()) {
+                 imageEngine.processPage(page);
+             }
+         }
+     }
+
+     /**
+      * Extracts the images embedded in a single page of a PDF.
+      *
+      * @param src     path of the PDF
+      * @param des     directory the images are written to
+      * @param pageNum 1-based page number; a value outside the document extracts nothing
+      * @throws IOException if the PDF cannot be read or an image cannot be written
+      */
+     public static void pdf2image(String src, String des, int pageNum) throws IOException {
+         File pdfFile = new File(src);
+         try (PDDocument load = PDDocument.load(pdfFile, MemoryUsageSetting.setupMixed(1024 * 1024))) {
+             DefPdfToImageEngine imageEngine = new DefPdfToImageEngine(baseName(pdfFile), des);
+             PDPageTree pages = load.getPages();
+             if (pageNum >= 1 && pageNum <= pages.getCount()) {
+                 imageEngine.processPage(pages.get(pageNum - 1));
+             }
+         }
+     }
+
+     /** Returns the file name without its last extension, or the whole name when it has no dot (previously that case threw). */
+     private static String baseName(File file) {
+         String fileName = file.getName();
+         int dot = fileName.lastIndexOf('.');
+         return dot < 0 ? fileName : fileName.substring(0, dot);
+     }
+
+     /** Manual check against hard-coded local paths; the commented-out lines are a sample timing run of encrypt/decrypt. */
+     public static void main(String[] args) throws Exception {
+         pdf2image("C:\\Users\\zhanghai\\Desktop\\桌面\\test\\a.pdf", "C:\\Users\\zhanghai\\Desktop\\桌面\\test\\");
+
+// long a1 = System.currentTimeMillis();
+// encrypt(vectorKey, "D:\\workspace\\docus\\业务梳理\\测试数据\\伟淞\\98m.pdf", "D:\\workspace\\docus\\业务梳理\\测试数据\\伟淞\\98m-encrypt.pdf");
+// long b1 = System.currentTimeMillis();
+// System.out.println("加密使用时间:" + (b1 - a1));
+//
+// long a2 = System.currentTimeMillis();
+// decrypt(vectorKey, "D:\\workspace\\docus\\业务梳理\\测试数据\\伟淞\\98m-encrypt.pdf", "D:\\workspace\\docus\\业务梳理\\测试数据\\伟淞\\98m-decrypt.pdf");
+// long b2 = System.currentTimeMillis();
+// System.out.println("解密使用时间:" + (b2 - a2));
+     }
+ }
diff --git a/src/main/java/com/docus/sw/word/ReadImgDoc.java b/src/main/java/com/docus/sw/word/ReadImgDoc.java
new file mode 100644
index 0000000..7c7fc9b
--- /dev/null
+++ b/src/main/java/com/docus/sw/word/ReadImgDoc.java
@@ -0,0 +1,43 @@
+package com.docus.sw.word;
+
+import java.io.*;
+import java.util.*;
+
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Range;
+
+public class ReadImgDoc {
+
+ public static void main(String[] args) throws Exception {
+ new ReadImgDoc().readPicture("C:\\Users\\zhanghai\\Desktop\\桌面\\test\\a.doc");
+ }
+
+ private void readPicture(String path)throws Exception{
+ FileInputStream in=new FileInputStream(new File(path));
+ HWPFDocument doc=new HWPFDocument(in);
+ int length=doc.characterLength();
+ PicturesTable pTable=doc.getPicturesTable();
+ // int TitleLength=doc.getSummaryInformation().getTitle().length();
+
+ // System.out.println(TitleLength);
+ // System.out.println(length);
+ for (int i=0;i