背景:
项目需求: 校对文章中出现的错敏词,三方系统会给出有问题的文字的下标,我们需要在原文档(docx)中替换有问题的词,将其改为建议修改的词。
思路:
我们的思路是先遍历所有的错敏词,然后在 Word 文档的段落和表格对象中进行扫描,统计每个错敏词在文档中出现的次数。如果某个错敏词只出现了一次,就可以直接替换。但如果某个错敏词出现了多次(两次及以上),这意味着相同的词在不同的语境下可能并不都是错误的,因此不能直接进行全局替换,而是需要根据三方系统给出的下标定位到具体的那一处再替换。
poi Maven 依赖
<!-- POI 相关 -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.0.0</version>
</dependency>
具体算法实现
/**
 * Replaces flagged words in the body of a .docx document with their suggested corrections.
 *
 * <p>The document body is treated as one long string built, in document order, from the
 * first text node of every run in each body paragraph ({@code run.getText(0)}) and the
 * text of every table cell ({@code cell.getText()}). For each replacement:
 * <ul>
 *   <li>if the target word does not occur, nothing is changed;</li>
 *   <li>if it occurs exactly once, that occurrence is replaced;</li>
 *   <li>if it occurs more than once, only the first occurrence whose offset is
 *       {@code >= startIndex} is replaced.</li>
 * </ul>
 *
 * <p>Limitations visible in this implementation: a word that is split across several runs
 * of a body paragraph is not detected, and a table cell that receives a replacement is
 * rewritten as a single paragraph with a single run, so run-level formatting inside that
 * cell is not preserved.
 *
 * @param doc          the document to modify in place; must not be {@code null}
 * @param replacements the corrections to apply; must not be {@code null}. Entries whose
 *                     target word is {@code null} or empty, or whose replacement word is
 *                     {@code null}, are skipped.
 */
public static void replaceText(XWPFDocument doc, List<Replacement> replacements) {
    // The list is processed back to front, exactly as before. Presumably callers order it
    // so that later offsets are handled first -- TODO confirm against the caller.
    for (int z = replacements.size(); z > 0; z--) {
        Replacement replacement = replacements.get(z - 1);
        String targetWord = replacement.getTargetWord();
        String replacementWord = replacement.getReplacementWord();

        // An empty target would never terminate the scan: indexOf("", from) never returns -1.
        if (targetWord == null || targetWord.isEmpty() || replacementWord == null) {
            log.info("目标词或替换词为空,已跳过。");
            continue;
        }

        List<IBodyElement> bodyElements = doc.getBodyElements();
        List<Integer> matchOffsets = findMatchOffsets(bodyElements, targetWord);
        if (matchOffsets.isEmpty()) {
            log.info("未找到目标词 \"" + targetWord + "\"。");
            continue;
        }

        boolean multipleMatches = matchOffsets.size() > 1;
        int replaceOffset = -1;
        if (multipleMatches) {
            // Several occurrences: use startIndex to pick the one that was actually flagged.
            int startIndex = replacement.getStartIndex();
            for (int offset : matchOffsets) {
                if (offset >= startIndex) {
                    replaceOffset = offset;
                    break;
                }
            }
            if (replaceOffset == -1) {
                log.info("未找到从给定的 startIndex 开始的目标词 \"" + targetWord + "\"。");
                continue;
            }
        } else {
            // A single occurrence is unambiguous, so startIndex is not consulted.
            replaceOffset = matchOffsets.get(0);
        }

        ReplaceOutcome outcome = replaceAtOffset(bodyElements, targetWord, replacementWord, replaceOffset);
        if (outcome == ReplaceOutcome.PARAGRAPH_RUN) {
            log.info("目标词-段落 \"" + targetWord + "\" 已被替换为 \"" + replacementWord + "\"。");
        } else if (outcome == ReplaceOutcome.TABLE_CELL) {
            // The two original branches used slightly different labels; both are kept as they were.
            String label = multipleMatches ? "目标词表格" : "目标词-表格";
            log.info(label + " \"" + targetWord + "\" 已被替换为 \"" + replacementWord + "\"。");
        } else {
            log.info("未找到目标词 \"" + targetWord + "\"。");
        }
    }
}

/** Tells the caller which kind of text unit, if any, was modified by {@link #replaceAtOffset}. */
private enum ReplaceOutcome { NONE, PARAGRAPH_RUN, TABLE_CELL }

/**
 * Scans the body in document order and returns the global offset of every occurrence of
 * {@code targetWord} (overlapping occurrences included).
 */
private static List<Integer> findMatchOffsets(List<IBodyElement> bodyElements, String targetWord) {
    List<Integer> offsets = new ArrayList<>();
    int currentIndex = 0;
    for (IBodyElement bodyElement : bodyElements) {
        if (bodyElement instanceof XWPFParagraph) {
            for (XWPFRun run : ((XWPFParagraph) bodyElement).getRuns()) {
                String text = run.getText(0);
                if (text != null) {
                    collectOffsets(text, targetWord, currentIndex, offsets);
                    currentIndex += text.length();
                }
            }
        } else if (bodyElement instanceof XWPFTable) {
            for (XWPFTableRow row : ((XWPFTable) bodyElement).getRows()) {
                for (XWPFTableCell cell : row.getTableCells()) {
                    String text = cell.getText();
                    if (text != null) {
                        collectOffsets(text, targetWord, currentIndex, offsets);
                        currentIndex += text.length();
                    }
                }
            }
        }
    }
    return offsets;
}

/** Appends {@code base + i} to {@code out} for every index {@code i} at which {@code target} occurs in {@code text}. */
private static void collectOffsets(String text, String target, int base, List<Integer> out) {
    int index = text.indexOf(target);
    while (index != -1) {
        out.add(base + index);
        index = text.indexOf(target, index + 1);
    }
}

/**
 * Replaces the single occurrence of {@code targetWord} that starts at global {@code offset}.
 * The traversal order and offset bookkeeping mirror {@link #findMatchOffsets} exactly, so an
 * offset produced there always lands on the same run or cell here.
 */
private static ReplaceOutcome replaceAtOffset(List<IBodyElement> bodyElements, String targetWord,
        String replacementWord, int offset) {
    int currentIndex = 0;
    for (IBodyElement bodyElement : bodyElements) {
        if (bodyElement instanceof XWPFParagraph) {
            for (XWPFRun run : ((XWPFParagraph) bodyElement).getRuns()) {
                String text = run.getText(0);
                if (text == null) {
                    continue;
                }
                int localIndex = offset - currentIndex;
                if (localIndex >= 0 && localIndex < text.length()) {
                    if (!text.startsWith(targetWord, localIndex)) {
                        return ReplaceOutcome.NONE;
                    }
                    log.info("下标数据:{},{},{}", localIndex, replacementWord, targetWord);
                    run.setText(splice(text, localIndex, targetWord.length(), replacementWord), 0);
                    return ReplaceOutcome.PARAGRAPH_RUN;
                }
                currentIndex += text.length();
            }
        } else if (bodyElement instanceof XWPFTable) {
            for (XWPFTableRow row : ((XWPFTable) bodyElement).getRows()) {
                for (XWPFTableCell cell : row.getTableCells()) {
                    String text = cell.getText();
                    if (text == null) {
                        continue;
                    }
                    int localIndex = offset - currentIndex;
                    if (localIndex >= 0 && localIndex < text.length()) {
                        if (!text.startsWith(targetWord, localIndex)) {
                            return ReplaceOutcome.NONE;
                        }
                        rewriteCell(cell, splice(text, localIndex, targetWord.length(), replacementWord));
                        return ReplaceOutcome.TABLE_CELL;
                    }
                    currentIndex += text.length();
                }
            }
        }
    }
    return ReplaceOutcome.NONE;
}

/** Returns {@code text} with {@code length} characters starting at {@code index} replaced by {@code replacement} (literal, no regex). */
private static String splice(String text, int index, int length, String replacement) {
    return text.substring(0, index) + replacement + text.substring(index + length);
}

/**
 * Replaces the whole content of {@code cell} with {@code text}.
 * Every existing paragraph is removed first: {@code text} was derived from the text of all
 * paragraphs of the cell, so leaving any of them in place would duplicate content.
 */
private static void rewriteCell(XWPFTableCell cell, String text) {
    for (int p = cell.getParagraphs().size() - 1; p >= 0; p--) {
        cell.removeParagraph(p);
    }
    cell.addParagraph().createRun().setText(text);
}
/**
 * Immutable description of one correction: the flagged word, the word to put in its place,
 * and the text offset reported for the flagged occurrence.
 */
public static class Replacement {
    /** The flagged word to look for in the document. */
    private final String targetWord;
    /** The suggested word that replaces {@link #targetWord}. */
    private final String replacementWord;
    /** Offset used to pick the right occurrence when the word appears more than once. */
    private final int startIndex;

    /**
     * @param targetWord      the flagged word to look for
     * @param replacementWord the suggested word to insert instead
     * @param startIndex      offset of the flagged occurrence in the document text
     */
    public Replacement(String targetWord, String replacementWord, int startIndex) {
        this.targetWord = targetWord;
        this.replacementWord = replacementWord;
        this.startIndex = startIndex;
    }

    /** @return the flagged word */
    public String getTargetWord() {
        return targetWord;
    }

    /** @return the suggested replacement word */
    public String getReplacementWord() {
        return replacementWord;
    }

    /** @return the offset of the flagged occurrence */
    public int getStartIndex() {
        return startIndex;
    }
}
调用
/**
 * Demo entry point: opens a legacy .doc file, applies two sample replacements and writes the
 * result to a new file.
 *
 * <p>NOTE(review): this example opens an {@code HWPFDocument} and calls {@code replaceTextDoc},
 * which is not part of this snippet; presumably it is the .doc counterpart of
 * {@code replaceText(XWPFDocument, List)} -- confirm. For .docx input, open an
 * {@code XWPFDocument} instead.
 *
 * @param args unused
 */
@SneakyThrows
public static void main(String[] args) {
    String inputFilePath = "/Users/zhangburui/Downloads/zpr_test.doc";
    String outputFilePath = "/Users/zhangburui/Downloads/abc_modified.doc";
    // try-with-resources closes the input stream and the document even when a step fails.
    try (FileInputStream inputStream = new FileInputStream(inputFilePath);
         HWPFDocument doc = new HWPFDocument(inputStream)) {
        List<Replacement> replacements = new ArrayList<>();
        replacements.add(new Replacement("人民法院法院", "replacement2", 59541));
        replacements.add(new Replacement("人民法院法院", "replacement1", 23218));
        // Apply the replacements to the in-memory document.
        replaceTextDoc(doc, replacements);
        // Write the modified document; the output stream is closed even if write() throws.
        try (FileOutputStream outputStream = new FileOutputStream(outputFilePath)) {
            doc.write(outputStream);
        }
        System.out.println("替换完成,并成功写回Word文件。");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
评论区