搭建ES,版本6.3.2,包含ik分词器
# docker-compose definition for an Elasticsearch 6.3.2 service.
# Nesting is significant in YAML: every key below `services` must be indented under it.
version: '2'
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:6.3.2
    environment:
      - TZ=Asia/Shanghai
      - LANG=en_US.UTF-8
      # JVM heap for Elasticsearch: initial and maximum size both 512 MB
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ports:
      # host port : container port
      - "59200:9200"
      - "59300:9300"
    volumes:
      # Host ./plugins directory is mounted as the Elasticsearch plugin directory
      # (the IK analyzer plugin is expected to be placed here).
      - ./plugins:/usr/share/elasticsearch/plugins
pom.xml文件加入依赖
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
<dependency>
<groupId>io.searchbox</groupId>
<artifactId>jest</artifactId>
<version>5.3.3</version>
</dependency>
建立ES的关联实体
package fun.gudu.modules.entity;
import io.searchbox.annotations.JestId;
import lombok.Data;
/**
 * Document stored in Elasticsearch for one file found on disk.
 *
 * <p>Accessors, {@code equals}/{@code hashCode} and {@code toString} are generated by Lombok's
 * {@code @Data}.
 */
@Data
public class ArticleEntity {
// Document id; annotated with @JestId so the Jest client treats this field as the document id.
@JestId
private String id;
// NOTE(review): the indexer (DirectoryRecurse.createIndex) stores the file's parent directory here,
// not a person's name - confirm this is intended.
private String author;
// File name, as set by the indexer.
private String title;
// File path, as set by the indexer.
private String path;
// Text extracted from the file.
private String content;
// MD5 fingerprint of the file, used by the indexer to detect files that are already indexed.
private String fileFingerprint;
}
循环文件夹加入ES索引
package fun.gudu.modules.utils;
import cn.hutool.core.io.FileUtil;
import cn.hutool.poi.excel.ExcelReader;
import cn.hutool.poi.excel.ExcelUtil;
import com.alibaba.fastjson.JSONObject;
import fun.gudu.modules.entity.ArticleEntity;
import io.searchbox.client.JestClient;
import io.searchbox.core.DocumentResult;
import io.searchbox.core.Index;
import io.searchbox.core.Search;
import io.searchbox.core.SearchResult;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@Slf4j
@Component
public class DirectoryRecurse {
@Autowired
private JestClient jestClient;
//读取文件内容转换为字符串
private String readToString(File file, String fileType) {
StringBuffer result = new StringBuffer();
switch (fileType) {
case "text/plain":
case "java":
case "c":
case "cpp":
case "log":
case "txt":
try (FileInputStream in = new FileInputStream(file)) {
Long filelength = file.length();
byte[] filecontent = new byte[filelength.intValue()];
in.read(filecontent);
result.append(new String(filecontent, "utf8"));
} catch (FileNotFoundException e) {
log.error("{}", e.getLocalizedMessage());
} catch (IOException e) {
log.error("{}", e.getLocalizedMessage());
}
break;
case "pdf":
try {
PDDocument document = PDDocument.load(file);
PDFTextStripper stripper = new PDFTextStripper();
result.append(stripper.getText(document));
} catch (Exception e) {
log.error("{}", e.getLocalizedMessage());
}
break;
case "xls":
case "xlsx":
ExcelReader reader = ExcelUtil.getReader(FileUtil.file(file.getPath()));
List<Map<String,Object>> readAll = reader.readAll();
for (Map<String, Object> map : readAll) {
result.append(map.toString());
}
break;
case "docx":
try {
FileInputStream fis = new FileInputStream(file);
XWPFDocument xdoc = new XWPFDocument(fis);
XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);
result.append(extractor.getText());
extractor.close();
// OPCPackage opcPackage = POIXMLDocument.openPackage(file.getPath());
// POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
// result.append(extractor.getText());
// extractor.close();
// Document document = new Document(file.getPath());
// String text = document.getText();
// result.append(text);
} catch (Exception e) {
log.error("{}", e.getLocalizedMessage());
}
break;
case "doc":
//使用HWPF组件中WordExtractor类从Word文档中提取文本或段落
try {
FileInputStream fis = new FileInputStream(file);
HWPFDocument doc = new HWPFDocument(fis);
result.append(doc.getText());
fis.close();
} catch (Exception e) {
log.error("{}", e.getLocalizedMessage());
}
break;
default:
break;
}
return result.toString();
}
//判断是否已经索引
private JSONObject isIndex(File file) {
JSONObject result = new JSONObject();
//用MD5生成文件指纹,搜索该指纹是否已经索引
String fileFingerprint = Md5CaculateUtil.getMD5(file);
result.put("fileFingerprint", fileFingerprint);
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
searchSourceBuilder.query(QueryBuilders.termQuery("fileFingerprint", fileFingerprint));
Search search = new Search.Builder(searchSourceBuilder.toString()).addIndex("diskfile").addType("files").build();
try {
//执行
SearchResult searchResult = jestClient.execute(search);
if (searchResult.getTotal() == null) {
result.put("isIndex", false);
} else {
if (searchResult.getTotal() > 0) {
result.put("isIndex", true);
} else {
result.put("isIndex", false);
}
}
} catch (IOException e) {
log.error("{}", e.getLocalizedMessage());
}
return result;
}
//对文件目录及内容创建索引
private void createIndex(File file, String method) {
//忽略掉临时文件,以~$起始的文件名
if (file.getName().startsWith("~$")) {
return;
}
String fileType = null;
switch (method) {
case "ext":
String filename = file.getName();
String[] strArray = filename.split("\\.");
int suffixIndex = strArray.length - 1;
fileType = strArray[suffixIndex];
default:
break;
}
switch (fileType) {
case "text/plain":
case "java":
case "log":
case "c":
case "cpp":
case "txt":
case "pdf":
case "xls":
case "xlsx":
case "doc":
case "docx":
JSONObject isIndexResult = isIndex(file);
log.info("文件名:{},文件类型:{},MD5:{},建立索引:{}", file.getPath(), fileType, isIndexResult.getString("fileFingerprint"), isIndexResult.getBoolean("isIndex"));
if (isIndexResult.getBoolean("isIndex")) {
break;
}
//1\. 给ES中索引(保存)一个文档
ArticleEntity article = new ArticleEntity();
article.setTitle(file.getName());
article.setAuthor(file.getParent());
article.setPath(file.getPath());
article.setContent(readToString(file, fileType));
article.setFileFingerprint(isIndexResult.getString("fileFingerprint"));
//2\. 构建一个索引
Index index = new Index.Builder(article).index("diskfile").type("files").build();
try {
//3\. 执行
if (!jestClient.execute(index).getId().isEmpty()) {
log.info("构建索引成功!");
}
} catch (IOException e) {
log.error("{}", e.getLocalizedMessage());
}
break;
default:
break;
}
}
public void find(String pathName) throws IOException {
//获取pathName的File对象
File dirFile = new File(pathName);
//判断该文件或目录是否存在,不存在时在控制台输出提醒
if (!dirFile.exists()) {
log.info("do not exit");
return;
}
//判断如果不是一个目录,就判断是不是一个文件,时文件则输出文件路径
if (!dirFile.isDirectory()) {
if (dirFile.isFile()) {
createIndex(dirFile, "ext");
}
return;
}
//获取此目录下的所有文件名与目录名
String[] fileList = dirFile.list();
for (int i = 0; i < fileList.length; i++) {
//遍历文件目录
String string = fileList[i];
File file = new File(dirFile.getPath(), string);
//如果是一个目录,输出目录名后,进行递归
if (file.isDirectory()) {
//递归
find(file.getCanonicalPath());
} else {
createIndex(file, "ext");
}
}
}
}
建立索引
/**
 * HTTP GET {@code /createindex}: triggers indexing by calling
 * {@code directoryRecurse.find(indexRoot)}.
 *
 * @return always {@code null}; the outcome is only visible in the log
 */
@RequestMapping(value = "/createindex", method = RequestMethod.GET)
public List createindex() {
    try {
        directoryRecurse.find(indexRoot);
    } catch (IOException e) {
        // Pass the exception itself so the stack trace is logged, not only its message
        log.error("Failed to build the index under {}", indexRoot, e);
    }
    return null;
}
搜索
package org.jeecg.modules.system.controller;
import io.searchbox.client.JestClient;
import io.searchbox.core.Search;
import io.searchbox.core.SearchResult;
import lombok.extern.slf4j.Slf4j;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.jeecg.common.api.vo.Result;
import org.jeecg.modules.system.vo.DocumentVo;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.*;
import java.io.IOException;
import java.util.*;
@Slf4j
@RestController
@RequestMapping("/document")
public class DocumentController {
@Autowired
private JestClient jestClient;
@RequestMapping(value = "/search",method = RequestMethod.GET)
public Result<List> search(@RequestParam(name="keyword") String keyword){
Result<List> result = new Result<>();
// 精确,词语
MatchPhraseQueryBuilder matchPhraseQueryBuilder = QueryBuilders.matchPhraseQuery("content", keyword);
MatchPhraseQueryBuilder matchPhraseQueryBuilder1 = QueryBuilders.matchPhraseQuery("title", keyword);
BoolQueryBuilder childBoolQueryBuilder = new BoolQueryBuilder().should(matchPhraseQueryBuilder).should(matchPhraseQueryBuilder1);
BoolQueryBuilder boolQueryBuilder = new BoolQueryBuilder();
boolQueryBuilder.must(childBoolQueryBuilder);
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
searchSourceBuilder.query(boolQueryBuilder);
// 拆分,词语
// SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
// searchSourceBuilder.query(QueryBuilders.queryStringQuery(keyword));
// 字段完整精确
QueryBuilder queryBuilder = QueryBuilders.termQuery("user", "kimchy");
QueryBUilder queryBuilder = QueryBuilders.termQuery("user", "kimchy", "wenbronk", "vini");
// 根据ID、索引和类型删除记录
// Delete build = new Delete.Builder("CK4R-HUBi-FTUankmkcE").index("diskfile").type("files").build();
// jestClient.execute(build);
HighlightBuilder highlightBuilder = new HighlightBuilder();
highlightBuilder.preTags("<strong style='font-size:18px'>").postTags("</strong>");
//path属性高亮度
HighlightBuilder.Field highlightPath = new HighlightBuilder.Field("path");
highlightPath.highlighterType("unified");
highlightBuilder.field(highlightPath);
//title字段高亮度
HighlightBuilder.Field highlightTitle = new HighlightBuilder.Field("title");
highlightTitle.highlighterType("unified");
highlightBuilder.field(highlightTitle);
//content字段高亮度
HighlightBuilder.Field highlightContent = new HighlightBuilder.Field("content");
highlightContent.highlighterType("unified");
highlightBuilder.field(highlightContent);
//高亮度配置生效
searchSourceBuilder.highlighter(highlightBuilder);
log.info("搜索条件{}",searchSourceBuilder.toString());
//构建搜索功能
Search search = new Search.Builder(searchSourceBuilder.toString()).addIndex( "diskfile" ).addType( "files" ).build();
try {
//执行
SearchResult searchResult = jestClient.execute( search );
List<SearchResult.Hit<DocumentVo, Void>> hits = searchResult.getHits(DocumentVo.class);
result.setResult(hits);
return result;
} catch (IOException e) {
log.error("{}",e.getLocalizedMessage());
}
return null;
}
}