From 7628e55f4359ae9bcd023bf851cdccabcfcc7afb Mon Sep 17 00:00:00 2001 From: zhanghai <120228220@qq.com> Date: Sat, 25 Nov 2023 12:38:08 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BC=96=E5=86=99=E6=89=80=E6=9C=89=E5=9F=BA?= =?UTF-8?q?=E7=A1=80=E5=B7=A5=E5=85=B7=EF=BC=8C=E6=8F=90=E4=BE=9Bword=20ut?= =?UTF-8?q?il=20pdf=20util?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 28 ++- .../com/docus/sw/fenpan/FenpanService.java | 107 ++++++---- .../com/docus/sw/souyin/SuoyinService.java | 3 + .../java/com/docus/sw/word/GetPicsDocx.java | 35 ++++ .../java/com/docus/sw/word/PdfBoxUtils.java | 192 ++++++++++++++++++ .../java/com/docus/sw/word/ReadImgDoc.java | 43 ++++ 6 files changed, 368 insertions(+), 40 deletions(-) create mode 100644 src/main/java/com/docus/sw/word/GetPicsDocx.java create mode 100644 src/main/java/com/docus/sw/word/PdfBoxUtils.java create mode 100644 src/main/java/com/docus/sw/word/ReadImgDoc.java diff --git a/pom.xml b/pom.xml index 5210f91..db64daa 100644 --- a/pom.xml +++ b/pom.xml @@ -55,7 +55,7 @@ org.apache.pdfbox pdfbox - 2.0.23 + 2.0.24 bundle @@ -100,6 +100,32 @@ 1.4.0 + + org.apache.poi + poi + 5.2.2 + + + org.apache.poi + poi-ooxml + 5.2.2 + + + + + org.apache.poi + poi-scratchpad + 5.2.2 + + + + + net.coobird + thumbnailator + 0.4.20 + + + diff --git a/src/main/java/com/docus/sw/fenpan/FenpanService.java b/src/main/java/com/docus/sw/fenpan/FenpanService.java index e256993..bdaced5 100644 --- a/src/main/java/com/docus/sw/fenpan/FenpanService.java +++ b/src/main/java/com/docus/sw/fenpan/FenpanService.java @@ -2,6 +2,7 @@ package com.docus.sw.fenpan; import com.alibaba.excel.util.FileUtils; import com.docus.sw.Config; +import com.docus.sw.word.PdfBoxUtils; import lombok.extern.slf4j.Slf4j; import org.apache.commons.imaging.ImageInfo; import org.apache.commons.imaging.ImageReadException; @@ -86,6 +87,10 @@ public class FenpanService { new FenpanService().fenpan(saveUrl, readUrl); } + public List getDocument(Pieces piece){ + return null; + } + public Map readFile(String readUrl) { //读取文件夹。 @@ -99,54 +104,37 @@ public class FenpanService { //根据文件类型 if (piece.getFileTypeEnum() == FileTypeEnum.WORD) { //从word 直接提取图片 + //提取图片为document ,然后用于后面判断 } else if (piece.getFileTypeEnum() == FileTypeEnum.PDF) { - // 从pdf 提取图片 + // 从pdf 提取图片, + //提取图片为document ,然后用于后面判断 + File file = new File("temp"); + if(!file.exists()){ + file.mkdirs(); + } + + List documentList = new ArrayList<>(); + try { + PdfBoxUtils.pdf2image(piece.getAbsolutePath(),file.getAbsolutePath()); + File[] files = file.listFiles(); + for(File pdfImg:files){ + getDocumentList(documentList, pdfImg); + } + } catch (IOException e) { + log.error(e.getMessage(),e); + } + piece.put(documentList); + //删除对应的temp 文件 + file.delete(); } else { //是图片,直接从图片提取 List documentList = new ArrayList<>(); File sourceFile = new File(piece.getAbsolutePath()); File[] files = sourceFile.listFiles(); for (File file : files) { - //非图片模式,跳过。 - if (!(file.getName().endsWith(".jpg") || file.getName().endsWith(".png") - || file.getName().endsWith(".jpeg") || file.getName().endsWith(".tif") - || file.getName().endsWith(".tiff")) || file.getName().endsWith(".jp2") || file.getName().endsWith(".jpm")|| file.getName().endsWith(".gif")) { - continue; - } - - if (file.getName().endsWith(".jp2") || file.getName().endsWith(".jpm")) { - // 读取 JPEG 2000 图像文件 - - try { - BufferedImage image = ImageIO.read(file); - int height = image.getHeight(); - int width = image.getWidth(); - Document document = new Document(width, height, 300); - documentList.add(document); - } catch (IOException e) { - throw new RuntimeException(e); - } - - } else { - try { - ImageInfo imageInfo = Imaging.getImageInfo(file); - int height = imageInfo.getHeight(); - int width = imageInfo.getWidth(); - int physicalHeightDpi = imageInfo.getPhysicalHeightDpi(); - Document document = new Document(width, height, physicalHeightDpi); - documentList.add(document); - } catch (IOException e) { - FileUtils.delete(file); - throw new RuntimeException("非图片格式", e); - } catch (ImageReadException e) { - FileUtils.delete(file); - throw new RuntimeException(e); - } catch (IllegalArgumentException e) { - FileUtils.delete(file); - } - } + getDocumentList(documentList, file); } piece.put(documentList); } @@ -204,6 +192,47 @@ public class FenpanService { } + private static void getDocumentList(List documentList, File file) { + //非图片模式,跳过。 + if (!(file.getName().endsWith(".jpg") || file.getName().endsWith(".png") + || file.getName().endsWith(".jpeg") || file.getName().endsWith(".tif") + || file.getName().endsWith(".tiff")) || file.getName().endsWith(".jp2") || file.getName().endsWith(".jpm")|| file.getName().endsWith(".gif")) { + return; + } + + if (file.getName().endsWith(".jp2") || file.getName().endsWith(".jpm")) { + // 读取 JPEG 2000 图像文件 + + try { + BufferedImage image = ImageIO.read(file); + int height = image.getHeight(); + int width = image.getWidth(); + Document document = new Document(width, height, 300); + documentList.add(document); + } catch (IOException e) { + throw new RuntimeException(e); + } + + } else { + try { + ImageInfo imageInfo = Imaging.getImageInfo(file); + int height = imageInfo.getHeight(); + int width = imageInfo.getWidth(); + int physicalHeightDpi = imageInfo.getPhysicalHeightDpi(); + Document document = new Document(width, height, physicalHeightDpi); + documentList.add(document); + } catch (IOException e) { + FileUtils.delete(file); + throw new RuntimeException("非图片格式", e); + } catch (ImageReadException e) { + FileUtils.delete(file); + throw new RuntimeException(e); + } catch (IllegalArgumentException e) { + FileUtils.delete(file); + } + } + } + private void findAllDir(String absolutePath, List allDirectory) { File sourceFile = new File(absolutePath); diff --git a/src/main/java/com/docus/sw/souyin/SuoyinService.java b/src/main/java/com/docus/sw/souyin/SuoyinService.java index 539b418..240ac7b 100644 --- a/src/main/java/com/docus/sw/souyin/SuoyinService.java +++ b/src/main/java/com/docus/sw/souyin/SuoyinService.java @@ -70,6 +70,9 @@ public class SuoyinService { List pieceList = new ArrayList<>(); File[] piecesFile = rollFile.listFiles(); for(File piece:piecesFile){ + + //区分word和pdf + List documentList = new ArrayList<>(); Pieces pieces = new Pieces(FileTypeEnum.JPG, piece.getAbsolutePath(), piece.getName()); for (File docfile : piece.listFiles()) { diff --git a/src/main/java/com/docus/sw/word/GetPicsDocx.java b/src/main/java/com/docus/sw/word/GetPicsDocx.java new file mode 100644 index 0000000..10e051b --- /dev/null +++ b/src/main/java/com/docus/sw/word/GetPicsDocx.java @@ -0,0 +1,35 @@ +package com.docus.sw.word; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.List; + +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFPictureData; + +public class GetPicsDocx { + public static void main(String[] args) { + String path = "E:\\上海项目测试\\文档\\35.docx"; + File file = new File(path); + try { + FileInputStream fis = new FileInputStream(file); + XWPFDocument document = new XWPFDocument(fis); + XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(document); + String text = xwpfWordExtractor.getText(); + System.out.println(text); + List picList = document.getAllPictures(); + for (XWPFPictureData pic : picList) { + System.out.println(pic.getPictureType() + File.separator + pic.suggestFileExtension() + File.separator + pic.getFileName()); + byte[] bytev = pic.getData(); + FileOutputStream fos = new FileOutputStream("E:\\上海项目测试\\docxImage\\" + pic.getFileName()); + fos.write(bytev); + } + fis.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } +} \ No newline at end of file diff --git a/src/main/java/com/docus/sw/word/PdfBoxUtils.java b/src/main/java/com/docus/sw/word/PdfBoxUtils.java new file mode 100644 index 0000000..6df3485 --- /dev/null +++ b/src/main/java/com/docus/sw/word/PdfBoxUtils.java @@ -0,0 +1,192 @@ +package com.docus.sw.word; + +import net.coobird.thumbnailator.Thumbnails; +import net.coobird.thumbnailator.filters.Canvas; +import net.coobird.thumbnailator.geometry.Positions; +import org.apache.pdfbox.contentstream.PDFStreamEngine; +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.encryption.AccessPermission; +import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; +import org.apache.pdfbox.pdmodel.encryption.PDEncryption; +import org.apache.pdfbox.pdmodel.encryption.SecurityHandler; +import org.apache.pdfbox.pdmodel.encryption.StandardProtectionPolicy; +import org.apache.pdfbox.pdmodel.encryption.StandardSecurityHandler; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; + +import javax.imageio.ImageIO; +import java.awt.*; +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.IOException; +import java.util.List; + +/** + * pdfbox 加解密 + */ +public class PdfBoxUtils { + + private static final String vectorKey = "AesVKeyDocus@702"; + + /** + * 加密 + * + * @param key + * @param sourceFilePath + * @param destFilePath + * @throws Exception + */ + public static void encrypt(String key, String sourceFilePath, String destFilePath) throws Exception { + + File file = new File(sourceFilePath); + PDDocument load = PDDocument.load(file); + + AccessPermission permissions = new AccessPermission(); + // 是否可以插入/删除/旋转页面 + permissions.setCanAssembleDocument(false); + // 是否可以复制和提取内容 + permissions.setCanExtractContent(false); + + permissions.setCanExtractForAccessibility(false); + // 设置用户是否可以填写交互式表单字段(包括签名字段) + permissions.setCanFillInForm(false); + // 设置用户是否可以修改文档 + permissions.setCanModify(false); + // 设置用户是否可以添加或修改文本注释并填写交互式表单字段,如果canModify()返回true,则创建或修改交互式表单字段(包括签名字段)。 + permissions.setCanModifyAnnotations(false); + // 设置用户是否可以打印。 + permissions.setCanPrint(false); + // 设置用户是否可以降级格式打印文档 + permissions.setCanPrintDegraded(false); + + StandardProtectionPolicy p = new StandardProtectionPolicy(key, key, permissions); + SecurityHandler sh = new StandardSecurityHandler(p); + sh.prepareDocumentForEncryption(load); + PDEncryption encryptionOptions = new PDEncryption(); + encryptionOptions.setSecurityHandler(sh); + + load.setEncryptionDictionary(encryptionOptions); + load.save(destFilePath); + load.close(); + } + + /** + * 解密 + * + * @param key + * @param sourceFilePath + * @param destFilePath + * @throws InvalidPasswordException + * @throws IOException + */ + public static void decrypt(String key, String sourceFilePath, String destFilePath) throws InvalidPasswordException, IOException { + File file = new File(sourceFilePath); + PDDocument load = PDDocument.load(file, key); + load.setAllSecurityToBeRemoved(true); + load.save(destFilePath); + load.close(); + } + + + /** + * pdfbox pdf to image + */ + public static class DefPdfToImageEngine extends PDFStreamEngine { + public int imageNum = 1; + + private final String des; + private final String name; + + public DefPdfToImageEngine(String name, String des) { + this.des = des; + this.name = name; + } + + @Override + protected void processOperator(Operator operator, List operands) throws IOException { + String operatorName = operator.getName(); + if ("Do".equals(operatorName)) { + COSName objName = (COSName) operands.get(0); + PDXObject pdxObject = this.getResources().getXObject(objName); + if (pdxObject instanceof PDImageXObject) { + PDImageXObject imageXObject = (PDImageXObject) pdxObject; + BufferedImage image = imageXObject.getImage(); + ImageIO.write(image,"jpg",new File(this.des + this.name + "-" + this.imageNum + ".jpg")); + ++this.imageNum; + } else if (pdxObject instanceof PDFormXObject) { + PDFormXObject form = (PDFormXObject) pdxObject; + this.showForm(form); + } + } else { + super.processOperator(operator, operands); + } + + } + } + + /** + * pdf to image + * + * @param src + * @param des + * @throws IOException + */ + public static void pdf2image(String src, String des) throws IOException { + File pdfFile = new File(src); + PDDocument load = PDDocument.load(pdfFile, MemoryUsageSetting.setupMixed(1024 * 1024)); + DefPdfToImageEngine imageEngine = new DefPdfToImageEngine(pdfFile.getName().substring(0, pdfFile.getName().lastIndexOf(".")), des); + for (PDPage page : load.getPages()) { + imageEngine.processPage(page); + } + load.close(); + } + + + /** + * 指定提取哪一页 + * + * @param src + * @param des + * @param pageNum + * @throws IOException + */ + public static void pdf2image(String src, String des, int pageNum) throws IOException { + File pdfFile = new File(src); + PDDocument load = PDDocument.load(pdfFile, MemoryUsageSetting.setupMixed(1024 * 1024)); + DefPdfToImageEngine imageEngine = new DefPdfToImageEngine(pdfFile.getName().substring(0, pdfFile.getName().lastIndexOf(".")), des); + + PDPageTree pages = load.getPages(); + + for (int i = 0; i < pages.getCount(); i++) { + + if (pageNum == i + 1) { + imageEngine.processPage(pages.get(i)); + break; + } + } + + load.close(); + } + + public static void main(String[] args) throws Exception { + pdf2image("C:\\Users\\zhanghai\\Desktop\\桌面\\test\\a.pdf", "C:\\Users\\zhanghai\\Desktop\\桌面\\test\\"); + + +// long a1 = System.currentTimeMillis(); +// encrypt(vectorKey, "D:\\workspace\\docus\\业务梳理\\测试数据\\伟淞\\98m.pdf", "D:\\workspace\\docus\\业务梳理\\测试数据\\伟淞\\98m-encrypt.pdf"); +// long b1 = System.currentTimeMillis(); +// System.out.println("加密使用时间:" + (b1 - a1)); +// +// long a2 = System.currentTimeMillis(); +// decrypt(vectorKey, "D:\\workspace\\docus\\业务梳理\\测试数据\\伟淞\\98m-encrypt.pdf", "D:\\workspace\\docus\\业务梳理\\测试数据\\伟淞\\98m-decrypt.pdf"); +// long b2 = System.currentTimeMillis(); +// System.out.println("解密使用时间:" + (b2 - a2)); + } +} diff --git a/src/main/java/com/docus/sw/word/ReadImgDoc.java b/src/main/java/com/docus/sw/word/ReadImgDoc.java new file mode 100644 index 0000000..7c7fc9b --- /dev/null +++ b/src/main/java/com/docus/sw/word/ReadImgDoc.java @@ -0,0 +1,43 @@ +package com.docus.sw.word; + +import java.io.*; +import java.util.*; + + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.model.PicturesTable; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.Range; + +public class ReadImgDoc { + + public static void main(String[] args) throws Exception { + new ReadImgDoc().readPicture("C:\\Users\\zhanghai\\Desktop\\桌面\\test\\a.doc"); + } + + private void readPicture(String path)throws Exception{ + FileInputStream in=new FileInputStream(new File(path)); + HWPFDocument doc=new HWPFDocument(in); + int length=doc.characterLength(); + PicturesTable pTable=doc.getPicturesTable(); + // int TitleLength=doc.getSummaryInformation().getTitle().length(); + + // System.out.println(TitleLength); + // System.out.println(length); + for (int i=0;i