From f5a56078efb139703fcf4330b6ead1bbd896027b Mon Sep 17 00:00:00 2001 From: zhanghai <120228220@qq.com> Date: Sat, 25 Nov 2023 14:26:19 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BC=96=E5=86=99=E6=89=80=E6=9C=89=E5=9F=BA?= =?UTF-8?q?=E7=A1=80=E5=B7=A5=E5=85=B7=EF=BC=8C=E6=8F=90=E4=BE=9Bword=20ut?= =?UTF-8?q?il=20pdf=20util?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/docus/sw/fenpan/FenpanService.java | 52 +++++++++++++++- .../com/docus/sw/fenpan/FileTypeEnum.java | 3 +- .../java/com/docus/sw/word/GetPicsDocx.java | 39 ++++++++++-- .../java/com/docus/sw/word/ReadImgDoc.java | 60 +++++++++++++------ 4 files changed, 129 insertions(+), 25 deletions(-) diff --git a/src/main/java/com/docus/sw/fenpan/FenpanService.java b/src/main/java/com/docus/sw/fenpan/FenpanService.java index bdaced5..eae64a8 100644 --- a/src/main/java/com/docus/sw/fenpan/FenpanService.java +++ b/src/main/java/com/docus/sw/fenpan/FenpanService.java @@ -2,6 +2,7 @@ package com.docus.sw.fenpan; import com.alibaba.excel.util.FileUtils; import com.docus.sw.Config; +import com.docus.sw.word.GetPicsDocx; import com.docus.sw.word.PdfBoxUtils; import lombok.extern.slf4j.Slf4j; import org.apache.commons.imaging.ImageInfo; @@ -102,10 +103,52 @@ public class FenpanService { for (Pieces piece : pieces) { //根据文件类型 - if (piece.getFileTypeEnum() == FileTypeEnum.WORD) { + if (piece.getFileTypeEnum() == FileTypeEnum.DOC) { //从word 直接提取图片 //提取图片为document ,然后用于后面判断 + File file = new File("temp"); + if(!file.exists()){ + file.mkdirs(); + } + + GetPicsDocx.getPics(piece.getAbsolutePath(),file.getAbsolutePath()); + List documentList = new ArrayList<>(); + try { + PdfBoxUtils.pdf2image(piece.getAbsolutePath(),file.getAbsolutePath()); + File[] files = file.listFiles(); + for(File pdfImg:files){ + getDocumentList(documentList, pdfImg); + } + } catch (IOException e) { + log.error(e.getMessage(),e); + } + + piece.put(documentList); + //删除对应的temp 文件 + file.delete(); + } else if (piece.getFileTypeEnum() == FileTypeEnum.DOCX) { + //从word 直接提取图片 + //提取图片为document ,然后用于后面判断 + File file = new File("temp"); + if(!file.exists()){ + file.mkdirs(); + } + + GetPicsDocx.getPics(piece.getAbsolutePath(),file.getAbsolutePath()); + List documentList = new ArrayList<>(); + try { + PdfBoxUtils.pdf2image(piece.getAbsolutePath(),file.getAbsolutePath()); + File[] files = file.listFiles(); + for(File pdfImg:files){ + getDocumentList(documentList, pdfImg); + } + } catch (IOException e) { + log.error(e.getMessage(),e); + } + piece.put(documentList); + //删除对应的temp 文件 + file.delete(); } else if (piece.getFileTypeEnum() == FileTypeEnum.PDF) { // 从pdf 提取图片, //提取图片为document ,然后用于后面判断 @@ -247,8 +290,11 @@ public class FenpanService { if (o.getName().endsWith(".pdf")) { Pieces pieces = new Pieces(FileTypeEnum.PDF, o.getAbsolutePath(), o.getName()); allDirectory.add(pieces); - } else if (o.getName().endsWith(".docx") || o.getName().endsWith(".doc")) { - Pieces pieces = new Pieces(FileTypeEnum.WORD, o.getAbsolutePath(), o.getName()); + } else if (o.getName().endsWith(".docx") ) { + Pieces pieces = new Pieces(FileTypeEnum.DOCX, o.getAbsolutePath(), o.getName()); + allDirectory.add(pieces); + } else if ( o.getName().endsWith(".doc")) { + Pieces pieces = new Pieces(FileTypeEnum.DOC, o.getAbsolutePath(), o.getName()); allDirectory.add(pieces); } else if (o.getName().endsWith(".jpg") || o.getName().endsWith(".png") || o.getName().endsWith(".jpeg") || o.getName().endsWith(".tif") diff --git a/src/main/java/com/docus/sw/fenpan/FileTypeEnum.java b/src/main/java/com/docus/sw/fenpan/FileTypeEnum.java index 8c8a59d..0d24603 100644 --- a/src/main/java/com/docus/sw/fenpan/FileTypeEnum.java +++ b/src/main/java/com/docus/sw/fenpan/FileTypeEnum.java @@ -1,7 +1,8 @@ package com.docus.sw.fenpan; public enum FileTypeEnum { - WORD, + DOC, + DOCX, PDF, JPG } diff --git a/src/main/java/com/docus/sw/word/GetPicsDocx.java b/src/main/java/com/docus/sw/word/GetPicsDocx.java index 10e051b..6dd579f 100644 --- a/src/main/java/com/docus/sw/word/GetPicsDocx.java +++ b/src/main/java/com/docus/sw/word/GetPicsDocx.java @@ -1,15 +1,15 @@ package com.docus.sw.word; +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFPictureData; + import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.util.List; -import org.apache.poi.xwpf.extractor.XWPFWordExtractor; -import org.apache.poi.xwpf.usermodel.XWPFDocument; -import org.apache.poi.xwpf.usermodel.XWPFPictureData; - public class GetPicsDocx { public static void main(String[] args) { String path = "E:\\上海项目测试\\文档\\35.docx"; @@ -32,4 +32,35 @@ public class GetPicsDocx { e.printStackTrace(); } } + + + public static void getPics(String fromPath, String toPath) { + File file = new File(fromPath); + FileInputStream fis = null; + try { + fis = new FileInputStream(file); + XWPFDocument document = new XWPFDocument(fis); + XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(document); + String text = xwpfWordExtractor.getText(); + System.out.println(text); + List picList = document.getAllPictures(); + int i = 1; + for (XWPFPictureData pic : picList) { + byte[] bytev = pic.getData(); + FileOutputStream fos = new FileOutputStream(toPath +i+ pic.getFileName()); + fos.write(bytev); + } + + } catch (IOException e) { + e.printStackTrace(); + } finally { + if (fis != null) { + try { + fis.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } } \ No newline at end of file diff --git a/src/main/java/com/docus/sw/word/ReadImgDoc.java b/src/main/java/com/docus/sw/word/ReadImgDoc.java index 7c7fc9b..e5e9f32 100644 --- a/src/main/java/com/docus/sw/word/ReadImgDoc.java +++ b/src/main/java/com/docus/sw/word/ReadImgDoc.java @@ -1,38 +1,40 @@ package com.docus.sw.word; -import java.io.*; -import java.util.*; - - import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.model.PicturesTable; import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Range; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.OutputStream; +import java.util.UUID; + public class ReadImgDoc { public static void main(String[] args) throws Exception { new ReadImgDoc().readPicture("C:\\Users\\zhanghai\\Desktop\\桌面\\test\\a.doc"); } - private void readPicture(String path)throws Exception{ - FileInputStream in=new FileInputStream(new File(path)); - HWPFDocument doc=new HWPFDocument(in); - int length=doc.characterLength(); - PicturesTable pTable=doc.getPicturesTable(); + private void readPicture(String path) throws Exception { + FileInputStream in = new FileInputStream(new File(path)); + HWPFDocument doc = new HWPFDocument(in); + int length = doc.characterLength(); + PicturesTable pTable = doc.getPicturesTable(); // int TitleLength=doc.getSummaryInformation().getTitle().length(); // System.out.println(TitleLength); // System.out.println(length); - for (int i=0;i