目 录CONTENT

文章目录

基于 POI 的指定下标文字替换算法实现

小张的探险日记
2024-01-19 / 0 评论 / 0 点赞 / 808 阅读 / 6,200 字 / 正在检测是否收录...
温馨提示:
本文最后更新于 2024-01-19,若内容或图片失效,请留言反馈。部分素材来自网络,若不小心影响到您的利益,请联系我们删除。

背景:

项目需求: 校对文章中出现的错敏词,三方系统会给出有问题的文字的下标,我们需要在原文档(docx)中替换有问题的词,将其改为建议修改的词。

思路:

我们的思路是先遍历所有的错敏词,然后在 Word 文档的段落和表格对象中进行扫描,统计每个错敏词在文档中出现的次数。如果某个错敏词只出现了一次,那么可以直接进行替换。但如果某个错敏词出现了多次(两次及以上),说明相同的词在不同的语境下未必都是错误的,因此不能直接进行全局替换,而是需要根据三方系统给出的下标定位到具体的那一处,再进行替换。

poi Maven 依赖

 <!-- POI 相关 -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.0.0</version>
        </dependency>

具体算法实现

/**
 * Replaces flagged words in a .docx document, using the supplied start index to
 * pick the right occurrence when a word appears more than once.
 *
 * <p>For every {@link Replacement} (processed from the last list element to the
 * first) the method scans all text runs of the document in document order: runs of
 * body-level paragraphs, and runs of every paragraph inside every table cell. While
 * scanning it records the running character offset of each occurrence of the target
 * word. Then:
 * <ul>
 *   <li>no occurrence: nothing is changed and a message is logged;</li>
 *   <li>exactly one occurrence: that occurrence is replaced;</li>
 *   <li>several occurrences: only the first occurrence whose offset is
 *       {@code >= startIndex} is replaced; if there is none, nothing is changed.</li>
 * </ul>
 *
 * <p>Replacement is done in place on the run that holds the occurrence, so the run's
 * formatting is kept and no other run or cell is touched. Matching is literal (no
 * regular-expression interpretation). A target word that is split across several
 * runs is not detected, because matching is done per run.
 *
 * @param doc          the document to modify in place; {@code null} is a no-op
 * @param replacements the replacements to apply; {@code null} or empty is a no-op
 */
public static void replaceText(XWPFDocument doc, List<Replacement> replacements) {
	if (doc == null || replacements == null || replacements.isEmpty()) {
		return;
	}

	// The run objects stay the same across replacements (only their text changes),
	// so the document structure is walked once.
	List<XWPFRun> runs = new ArrayList<>();
	List<Boolean> runInTable = new ArrayList<>();
	collectTextRuns(doc, runs, runInTable);

	for (int z = replacements.size() - 1; z >= 0; z--) {
		Replacement replacement = replacements.get(z);
		if (replacement == null) {
			continue;
		}
		String targetWord = replacement.getTargetWord();
		String replacementWord = replacement.getReplacementWord();
		int startIndex = replacement.getStartIndex();

		// An empty target would make indexOf match at every position forever.
		if (targetWord == null || targetWord.isEmpty() || replacementWord == null) {
			log.info("未找到目标词 \"" + targetWord + "\"。");
			continue;
		}

		// Scan: each match is {run position, offset inside the run, document offset}.
		List<int[]> matches = new ArrayList<>();
		int currentIndex = 0;
		for (int r = 0; r < runs.size(); r++) {
			String text = runs.get(r).getText(0);
			if (text == null) {
				continue;
			}
			int indexInRun = text.indexOf(targetWord);
			while (indexInRun != -1) {
				matches.add(new int[] {r, indexInRun, currentIndex + indexInRun});
				indexInRun = text.indexOf(targetWord, indexInRun + 1);
			}
			currentIndex += text.length();
		}

		if (matches.isEmpty()) {
			log.info("未找到目标词 \"" + targetWord + "\"。");
			continue;
		}

		int[] chosen = null;
		if (matches.size() == 1) {
			// A unique occurrence is unambiguous; the index is not needed.
			chosen = matches.get(0);
		} else {
			// Ambiguous: take the first occurrence at or after the reported index.
			for (int[] match : matches) {
				if (match[2] >= startIndex) {
					chosen = match;
					break;
				}
			}
			if (chosen == null) {
				log.info("未找到从给定的 startIndex 开始的目标词 \"" + targetWord + "\"。");
				continue;
			}
			log.info("下标数据:{},{},{}", chosen[1], replacementWord, targetWord);
		}

		// Replace exactly the chosen occurrence inside its run.
		XWPFRun run = runs.get(chosen[0]);
		String text = run.getText(0);
		String updated = text.substring(0, chosen[1])
				+ replacementWord
				+ text.substring(chosen[1] + targetWord.length());
		run.setText(updated, 0);

		if (runInTable.get(chosen[0])) {
			log.info("目标词-表格 \"" + targetWord + "\" 已被替换为 \"" + replacementWord + "\"。");
		} else {
			log.info("目标词-段落 \"" + targetWord + "\" 已被替换为 \"" + replacementWord + "\"。");
		}
	}
}

/**
 * Appends, in document order, every run of every body-level paragraph and of every
 * paragraph inside every table cell to {@code runs}; {@code runInTable} receives a
 * parallel flag telling whether the run at the same position belongs to a table cell.
 */
private static void collectTextRuns(XWPFDocument doc, List<XWPFRun> runs, List<Boolean> runInTable) {
	for (IBodyElement bodyElement : doc.getBodyElements()) {
		if (bodyElement instanceof XWPFParagraph) {
			for (XWPFRun run : ((XWPFParagraph) bodyElement).getRuns()) {
				runs.add(run);
				runInTable.add(Boolean.FALSE);
			}
		} else if (bodyElement instanceof XWPFTable) {
			for (XWPFTableRow row : ((XWPFTable) bodyElement).getRows()) {
				for (XWPFTableCell cell : row.getTableCells()) {
					for (XWPFParagraph paragraph : cell.getParagraphs()) {
						for (XWPFRun run : paragraph.getRuns()) {
							runs.add(run);
							runInTable.add(Boolean.TRUE);
						}
					}
				}
			}
		}
	}
}
    
    
    
    /**
     * Immutable description of one correction: the word to look for, the word to
     * put in its place, and the start index used to choose between several
     * occurrences of the same word.
     */
    public static class Replacement {
        /** The flagged word to search for. */
        private final String targetWord;
        /** The text that replaces the flagged word. */
        private final String replacementWord;
        /** Start index of the flagged occurrence, as supplied by the caller. */
        private final int startIndex;

        /**
         * @param targetWord      the flagged word to search for
         * @param replacementWord the text that replaces the flagged word
         * @param startIndex      start index of the flagged occurrence
         */
        public Replacement(String targetWord, String replacementWord, int startIndex) {
            this.targetWord = targetWord;
            this.replacementWord = replacementWord;
            this.startIndex = startIndex;
        }

        /** @return the flagged word to search for */
        public String getTargetWord() {
            return targetWord;
        }

        /** @return the text that replaces the flagged word */
        public String getReplacementWord() {
            return replacementWord;
        }

        /** @return the start index of the flagged occurrence */
        public int getStartIndex() {
            return startIndex;
        }
    }



调用

	/**
	 * Demo entry point: loads a binary .doc file, applies two index-based
	 * replacements through {@code replaceTextDoc}, and writes the result to a new file.
	 *
	 * <p>The input stream, the document and the output stream are all opened in
	 * try-with-resources so they are closed even when reading, replacing or writing fails.
	 */
	@SneakyThrows
	public static void main(String[] args) {
		// Input and output locations of the Word document.
		String inputFilePath = "/Users/zhangburui/Downloads/zpr_test.doc";
		String outputFilePath = "/Users/zhangburui/Downloads/abc_modified.doc";

		List<Replacement> replacements = new ArrayList<>();
		replacements.add(new Replacement("人民法院法院", "replacement2", 59541));
		replacements.add(new Replacement("人民法院法院", "replacement1", 23218));

		// HWPFDocument handles the legacy .doc format; a .docx file would be opened
		// as an XWPFDocument instead.
		try (FileInputStream inputStream = new FileInputStream(inputFilePath);
				HWPFDocument doc = new HWPFDocument(inputStream)) {
			// Apply the replacements.
			replaceTextDoc(doc, replacements);

			// Open the output only after the replacements succeeded, so a failure
			// does not leave a truncated file behind.
			try (FileOutputStream outputStream = new FileOutputStream(outputFilePath)) {
				doc.write(outputStream);
			}

			System.out.println("替换完成,并成功写回Word文件。");

		} catch (IOException e) {
			e.printStackTrace();
		}
	}
0

评论区