编写所有基础工具,提供word util pdf util

master
zhanghai 2 years ago
parent 7cd20468ec
commit 7628e55f43

@ -55,7 +55,7 @@
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.23</version>
<version>2.0.24</version>
<type>bundle</type>
</dependency>
<dependency>
@ -100,6 +100,32 @@
<version>1.4.0</version> <!-- 请检查最新版本 -->
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.2</version> <!-- 请根据你的实际情况选择版本 -->
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.2</version> <!-- 请根据你的实际情况选择版本 -->
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/net.coobird/thumbnailator -->
<dependency>
<groupId>net.coobird</groupId>
<artifactId>thumbnailator</artifactId>
<version>0.4.20</version>
</dependency>
</dependencies>

@ -2,6 +2,7 @@ package com.docus.sw.fenpan;
import com.alibaba.excel.util.FileUtils;
import com.docus.sw.Config;
import com.docus.sw.word.PdfBoxUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.imaging.ImageInfo;
import org.apache.commons.imaging.ImageReadException;
@ -86,6 +87,10 @@ public class FenpanService {
new FenpanService().fenpan(saveUrl, readUrl);
}
/**
 * Placeholder for extracting the {@code Document} entries of a piece.
 *
 * <p>Not implemented yet: the argument is ignored and the method always returns {@code null},
 * so callers must null-check the result.
 *
 * @param piece the piece whose documents are requested (currently unused)
 * @return always {@code null}
 */
public List<Document> getDocument(Pieces piece){
return null;
}
public Map<String, Zong> readFile(String readUrl) {
//读取文件夹。
@ -99,54 +104,37 @@ public class FenpanService {
//根据文件类型
if (piece.getFileTypeEnum() == FileTypeEnum.WORD) {
//从word 直接提取图片
//提取图片为document ,然后用于后面判断
} else if (piece.getFileTypeEnum() == FileTypeEnum.PDF) {
// 从pdf 提取图片
// 从pdf 提取图片,
//提取图片为document ,然后用于后面判断
File file = new File("temp");
if(!file.exists()){
file.mkdirs();
}
List<Document> documentList = new ArrayList<>();
try {
PdfBoxUtils.pdf2image(piece.getAbsolutePath(),file.getAbsolutePath());
File[] files = file.listFiles();
for(File pdfImg:files){
getDocumentList(documentList, pdfImg);
}
} catch (IOException e) {
log.error(e.getMessage(),e);
}
piece.put(documentList);
//删除对应的temp 文件
file.delete();
} else {
//是图片,直接从图片提取
List<Document> documentList = new ArrayList<>();
File sourceFile = new File(piece.getAbsolutePath());
File[] files = sourceFile.listFiles();
for (File file : files) {
//非图片模式,跳过。
if (!(file.getName().endsWith(".jpg") || file.getName().endsWith(".png")
|| file.getName().endsWith(".jpeg") || file.getName().endsWith(".tif")
|| file.getName().endsWith(".tiff")) || file.getName().endsWith(".jp2") || file.getName().endsWith(".jpm")|| file.getName().endsWith(".gif")) {
continue;
}
if (file.getName().endsWith(".jp2") || file.getName().endsWith(".jpm")) {
// 读取 JPEG 2000 图像文件
try {
BufferedImage image = ImageIO.read(file);
int height = image.getHeight();
int width = image.getWidth();
Document document = new Document(width, height, 300);
documentList.add(document);
} catch (IOException e) {
throw new RuntimeException(e);
}
} else {
try {
ImageInfo imageInfo = Imaging.getImageInfo(file);
int height = imageInfo.getHeight();
int width = imageInfo.getWidth();
int physicalHeightDpi = imageInfo.getPhysicalHeightDpi();
Document document = new Document(width, height, physicalHeightDpi);
documentList.add(document);
} catch (IOException e) {
FileUtils.delete(file);
throw new RuntimeException("非图片格式", e);
} catch (ImageReadException e) {
FileUtils.delete(file);
throw new RuntimeException(e);
} catch (IllegalArgumentException e) {
FileUtils.delete(file);
}
}
getDocumentList(documentList, file);
}
piece.put(documentList);
}
@ -204,6 +192,47 @@ public class FenpanService {
}
/**
 * Reads the pixel size (and vertical DPI where available) of one image file and appends it to
 * {@code documentList} as a new {@code Document}.
 *
 * <p>Files whose name does not end with one of the supported image extensions are ignored.
 * {@code .jp2}/{@code .jpm} files are decoded through {@link ImageIO} and recorded with a fixed
 * DPI of 300; every other supported extension is inspected through {@code Imaging.getImageInfo}.
 *
 * <p>Fix: the original filter closed its parenthesis after {@code ".tiff"}, which made
 * {@code .jp2}, {@code .jpm} and {@code .gif} files count as "not an image" and left the
 * JPEG 2000 branch unreachable. All extensions are now inside the negated group.
 *
 * @param documentList list that receives the new entry; modified in place
 * @param file         candidate image file
 * @throws RuntimeException if the file has an image extension but cannot be read; for the
 *                          non-JPEG-2000 branch the file is deleted before the exception is thrown
 */
private static void getDocumentList(List<Document> documentList, File file) {
    String fileName = file.getName();
    boolean jpeg2000 = fileName.endsWith(".jp2") || fileName.endsWith(".jpm");
    boolean otherImage = fileName.endsWith(".jpg") || fileName.endsWith(".png")
            || fileName.endsWith(".jpeg") || fileName.endsWith(".tif")
            || fileName.endsWith(".tiff") || fileName.endsWith(".gif");

    // Not an image file: skip it.
    if (!jpeg2000 && !otherImage) {
        return;
    }

    if (jpeg2000) {
        // Read the JPEG 2000 image file.
        try {
            BufferedImage image = ImageIO.read(file);
            if (image == null) {
                // ImageIO.read returns null (instead of throwing) when no registered reader
                // can decode the file; fail with a clear message rather than an NPE.
                throw new RuntimeException(
                        "No ImageIO reader could decode JPEG 2000 file: " + file.getAbsolutePath());
            }
            int height = image.getHeight();
            int width = image.getWidth();
            Document document = new Document(width, height, 300);
            documentList.add(document);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    } else {
        try {
            ImageInfo imageInfo = Imaging.getImageInfo(file);
            int height = imageInfo.getHeight();
            int width = imageInfo.getWidth();
            int physicalHeightDpi = imageInfo.getPhysicalHeightDpi();
            Document document = new Document(width, height, physicalHeightDpi);
            documentList.add(document);
        } catch (IOException e) {
            FileUtils.delete(file);
            throw new RuntimeException("非图片格式", e);
        } catch (ImageReadException e) {
            FileUtils.delete(file);
            throw new RuntimeException(e);
        } catch (IllegalArgumentException e) {
            // Existing best-effort behaviour: the offending file is removed and processing
            // continues without adding an entry for it.
            FileUtils.delete(file);
        }
    }
}
private void findAllDir(String absolutePath, List<Pieces> allDirectory) {
File sourceFile = new File(absolutePath);

@ -70,6 +70,9 @@ public class SuoyinService {
List<Pieces> pieceList = new ArrayList<>();
File[] piecesFile = rollFile.listFiles();
for(File piece:piecesFile){
//区分word和pdf
List<Document> documentList = new ArrayList<>();
Pieces pieces = new Pieces(FileTypeEnum.JPG, piece.getAbsolutePath(), piece.getName());
for (File docfile : piece.listFiles()) {

@ -0,0 +1,35 @@
package com.docus.sw.word;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
/**
 * Stand-alone demo that prints the text of a {@code .docx} file and writes every embedded
 * picture to a fixed output directory.
 */
public class GetPicsDocx {

    /**
     * Opens the hard-coded sample document, prints its extracted text, then prints the type,
     * suggested extension and file name of each embedded picture and writes the picture bytes
     * to the output directory under the picture's own file name.
     *
     * <p>Any {@link IOException} is printed to standard error; nothing is rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String path = "E:\\上海项目测试\\文档\\35.docx";
        File file = new File(path);
        // try-with-resources: the input stream and the document are released on every path,
        // including when reading or writing fails part-way through.
        try (FileInputStream fis = new FileInputStream(file);
             XWPFDocument document = new XWPFDocument(fis)) {
            XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(document);
            String text = xwpfWordExtractor.getText();
            System.out.println(text);

            List<XWPFPictureData> picList = document.getAllPictures();
            for (XWPFPictureData pic : picList) {
                System.out.println(pic.getPictureType() + File.separator + pic.suggestFileExtension() + File.separator + pic.getFileName());
                byte[] bytev = pic.getData();
                // Each output stream is closed as soon as its picture has been written.
                try (FileOutputStream fos = new FileOutputStream("E:\\上海项目测试\\docxImage\\" + pic.getFileName())) {
                    fos.write(bytev);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

@ -0,0 +1,192 @@
package com.docus.sw.word;
import net.coobird.thumbnailator.Thumbnails;
import net.coobird.thumbnailator.filters.Canvas;
import net.coobird.thumbnailator.geometry.Positions;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.encryption.PDEncryption;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
import org.apache.pdfbox.pdmodel.encryption.StandardProtectionPolicy;
import org.apache.pdfbox.pdmodel.encryption.StandardSecurityHandler;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
 * PDF helpers built on Apache PDFBox: password-protect a PDF, remove the protection again, and
 * dump the raster images embedded in a PDF to JPEG files.
 */
public class PdfBoxUtils {
    // NOTE(review): key material hard-coded in source and not referenced by any method of this
    // class; consider moving it to configuration or removing it.
    private static final String vectorKey = "AesVKeyDocus@702";

    /**
     * Writes a password-protected copy of a PDF with all user permissions switched off.
     *
     * <p>The same {@code key} is used as owner and user password.
     * NOTE(review): the policy's key length is left at the library default — confirm it is
     * strong enough for the intended use.
     *
     * @param key            password to protect the document with
     * @param sourceFilePath path of the PDF to read
     * @param destFilePath   path the protected PDF is saved to
     * @throws Exception if the source cannot be loaded or the result cannot be saved
     */
    public static void encrypt(String key, String sourceFilePath, String destFilePath) throws Exception {
        File file = new File(sourceFilePath);
        // try-with-resources so the document is closed even when protecting or saving fails.
        try (PDDocument load = PDDocument.load(file)) {
            AccessPermission permissions = new AccessPermission();
            // Insert / delete / rotate pages.
            permissions.setCanAssembleDocument(false);
            // Copy and extract content.
            permissions.setCanExtractContent(false);
            permissions.setCanExtractForAccessibility(false);
            // Fill in interactive form fields (including signature fields).
            permissions.setCanFillInForm(false);
            // Modify the document.
            permissions.setCanModify(false);
            // Add or modify text annotations and fill in interactive form fields.
            permissions.setCanModifyAnnotations(false);
            // Print.
            permissions.setCanPrint(false);
            // Print in degraded quality.
            permissions.setCanPrintDegraded(false);

            StandardProtectionPolicy p = new StandardProtectionPolicy(key, key, permissions);
            SecurityHandler sh = new StandardSecurityHandler(p);
            sh.prepareDocumentForEncryption(load);
            PDEncryption encryptionOptions = new PDEncryption();
            encryptionOptions.setSecurityHandler(sh);
            load.setEncryptionDictionary(encryptionOptions);
            load.save(destFilePath);
        }
    }

    /**
     * Opens a password-protected PDF and saves a copy with all security removed.
     *
     * @param key            password used to open the source document
     * @param sourceFilePath path of the protected PDF
     * @param destFilePath   path the unprotected PDF is saved to
     * @throws InvalidPasswordException if {@code key} does not open the document
     * @throws IOException              if the source cannot be read or the result cannot be saved
     */
    public static void decrypt(String key, String sourceFilePath, String destFilePath) throws InvalidPasswordException, IOException {
        File file = new File(sourceFilePath);
        try (PDDocument load = PDDocument.load(file, key)) {
            load.setAllSecurityToBeRemoved(true);
            load.save(destFilePath);
        }
    }

    /**
     * Content-stream engine that writes every image XObject it encounters to
     * {@code <des>/<name>-<n>.jpg}, where {@code n} starts at 1 and increases per image.
     * Form XObjects are descended into so that images nested in forms are found as well.
     */
    public static class DefPdfToImageEngine extends PDFStreamEngine {
        /** 1-based index used in the file name of the next image to be written. */
        public int imageNum = 1;
        private final String des;
        private final String name;

        /**
         * @param name file-name prefix for the extracted images
         * @param des  output directory; a trailing separator is optional, and {@code null} or
         *             empty means the current working directory
         */
        public DefPdfToImageEngine(String name, String des) {
            this.des = des;
            this.name = name;
        }

        @Override
        protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
            String operatorName = operator.getName();
            if (!"Do".equals(operatorName)) {
                super.processOperator(operator, operands);
                return;
            }
            // A "Do" operator without a name operand is malformed; there is nothing to extract.
            if (operands.isEmpty() || !(operands.get(0) instanceof COSName)) {
                return;
            }
            COSName objName = (COSName) operands.get(0);
            PDXObject pdxObject = this.getResources().getXObject(objName);
            if (pdxObject instanceof PDImageXObject) {
                PDImageXObject imageXObject = (PDImageXObject) pdxObject;
                BufferedImage image = imageXObject.getImage();
                writeJpeg(image, targetFile(this.name + "-" + this.imageNum + ".jpg"));
                ++this.imageNum;
            } else if (pdxObject instanceof PDFormXObject) {
                PDFormXObject form = (PDFormXObject) pdxObject;
                this.showForm(form);
            }
        }

        /**
         * Resolves a file name against the output directory. Using the parent/child constructor
         * places the file inside {@code des} whether or not {@code des} ends with a separator;
         * plain string concatenation put it next to the directory when the separator was missing.
         */
        private File targetFile(String fileName) {
            if (this.des == null || this.des.isEmpty()) {
                return new File(fileName);
            }
            return new File(this.des, fileName);
        }

        /**
         * Writes {@code image} as JPEG. {@link ImageIO#write} returns {@code false} when no
         * writer accepts the image (presumably the case for images with an alpha channel —
         * TODO confirm); in that case the image is flattened onto a white RGB canvas and
         * written again instead of being dropped silently.
         */
        private static void writeJpeg(BufferedImage image, File target) throws IOException {
            if (ImageIO.write(image, "jpg", target)) {
                return;
            }
            int width = image.getWidth();
            int height = image.getHeight();
            BufferedImage rgb = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
            Graphics2D graphics = rgb.createGraphics();
            try {
                graphics.setColor(Color.WHITE);
                graphics.fillRect(0, 0, width, height);
                graphics.drawImage(image, 0, 0, null);
            } finally {
                graphics.dispose();
            }
            if (!ImageIO.write(rgb, "jpg", target)) {
                throw new IOException("No JPEG writer could encode image: " + target);
            }
        }
    }

    /** Returns the file name without its last extension; names without a dot are returned as is. */
    private static String baseName(File file) {
        String fileName = file.getName();
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? fileName : fileName.substring(0, dot);
    }

    /**
     * Extracts the embedded images of every page of a PDF into a directory.
     *
     * @param src path of the PDF
     * @param des output directory (trailing separator optional)
     * @throws IOException if the PDF cannot be read or an image cannot be written
     */
    public static void pdf2image(String src, String des) throws IOException {
        File pdfFile = new File(src);
        try (PDDocument load = PDDocument.load(pdfFile, MemoryUsageSetting.setupMixed(1024 * 1024))) {
            DefPdfToImageEngine imageEngine = new DefPdfToImageEngine(baseName(pdfFile), des);
            for (PDPage page : load.getPages()) {
                imageEngine.processPage(page);
            }
        }
    }

    /**
     * Extracts the embedded images of a single page of a PDF into a directory.
     *
     * @param src     path of the PDF
     * @param des     output directory (trailing separator optional)
     * @param pageNum 1-based page number; values outside the document are ignored
     * @throws IOException if the PDF cannot be read or an image cannot be written
     */
    public static void pdf2image(String src, String des, int pageNum) throws IOException {
        File pdfFile = new File(src);
        try (PDDocument load = PDDocument.load(pdfFile, MemoryUsageSetting.setupMixed(1024 * 1024))) {
            DefPdfToImageEngine imageEngine = new DefPdfToImageEngine(baseName(pdfFile), des);
            PDPageTree pages = load.getPages();
            if (pageNum >= 1 && pageNum <= pages.getCount()) {
                imageEngine.processPage(pages.get(pageNum - 1));
            }
        }
    }

    /**
     * Manual smoke test: extracts the images of a sample PDF on the developer's machine.
     *
     * @param args ignored
     * @throws Exception if extraction fails
     */
    public static void main(String[] args) throws Exception {
        pdf2image("C:\\Users\\zhanghai\\Desktop\\桌面\\test\\a.pdf", "C:\\Users\\zhanghai\\Desktop\\桌面\\test\\");
    }
}

@ -0,0 +1,43 @@
package com.docus.sw.word;
import java.io.*;
import java.util.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
/**
 * Stand-alone demo that extracts the pictures embedded in a binary {@code .doc} file.
 */
public class ReadImgDoc {

    /**
     * Runs the extraction against a sample document on the developer's machine.
     *
     * @param args ignored
     * @throws Exception if the document cannot be read or a picture cannot be written
     */
    public static void main(String[] args) throws Exception {
        new ReadImgDoc().readPicture("C:\\Users\\zhanghai\\Desktop\\桌面\\test\\a.doc");
    }

    /**
     * Walks the document one character at a time and, for every character run that the
     * pictures table reports as holding a picture, writes that picture to the output directory
     * under a random UUID followed by the picture's suggested file name.
     *
     * @param path path of the {@code .doc} file
     * @throws Exception if the document cannot be opened or a picture cannot be written
     */
    private void readPicture(String path) throws Exception {
        // try-with-resources: the input stream and the document are closed on every path.
        try (FileInputStream in = new FileInputStream(new File(path));
             HWPFDocument doc = new HWPFDocument(in)) {
            int length = doc.characterLength();
            PicturesTable pTable = doc.getPicturesTable();
            for (int i = 0; i < length; i++) {
                Range range = new Range(i, i + 1, doc);
                CharacterRun cr = range.getCharacterRun(0);
                if (pTable.hasPicture(cr)) {
                    Picture pic = pTable.extractPicture(cr, false);
                    String afileName = pic.suggestFullFileName();
                    // Close each picture's stream right after writing so handles do not pile up.
                    try (OutputStream out = new FileOutputStream(new File("C:\\Users\\zhanghai\\Desktop\\桌面\\test\\" + UUID.randomUUID() + afileName))) {
                        pic.writeImageContent(out);
                    }
                }
            }
        }
    }
}
Loading…
Cancel
Save