使用docker-compose搭建ES,版本6.3.2,包含ik分词器

# Docker Compose definition that runs one Elasticsearch 6.3.2 container.
version: '2'
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:6.3.2
    environment:
      - TZ=Asia/Shanghai
      - LANG=en_US.UTF-8
      # Initial (-Xms) and maximum (-Xmx) JVM heap are both set to 512 MB.
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ports:
      # Format is host:container. Container ports 9200 and 9300 are published
      # on host ports 59200 and 59300 respectively.
      - "59200:9200"
      - "59300:9300"
    volumes:
      # Bind-mounts the local ./plugins directory over the container's plugin directory.
      # Presumably the ik analyzer is unpacked into ./plugins - TODO confirm.
      - ./plugins:/usr/share/elasticsearch/plugins

pom.xml文件加入依赖

<!-- Spring Boot starter for Elasticsearch. No version is given here, so it is presumably
     managed by the Spring Boot parent/BOM - verify against the full pom.xml. -->
<dependency>
  <groupId>org.springframework.boot</groupId>
  <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
<!-- Jest client, pinned to 5.3.3. The Java code in this document imports its
     io.searchbox.* classes (JestClient, Index, Search, SearchResult, JestId). -->
<dependency>
  <groupId>io.searchbox</groupId>
  <artifactId>jest</artifactId>
  <version>5.3.3</version>
</dependency>

建立ES的关联实体

package fun.gudu.modules.entity;

import io.searchbox.annotations.JestId;
import lombok.Data;

/**
 * Document model that is sent to Elasticsearch for one indexed file.
 *
 * <p>Accessors are generated by Lombok's {@code @Data}. The field descriptions below
 * reflect how {@code DirectoryRecurse.createIndex} populates the entity.
 */
@Data
public class ArticleEntity {
    /** Document id, marked with {@code @JestId}. The indexing code shown in this document never sets it. */
    @JestId
    private String id;
    /** Filled with the file's parent directory ({@code File.getParent()}), not a person's name. */
    private String author;
    /** Filled with the file name ({@code File.getName()}). */
    private String title;
    /** Filled with the file path ({@code File.getPath()}). */
    private String path;
    /** Text extracted from the file. */
    private String content;
    /** MD5 fingerprint of the file, used to detect files that were already indexed. */
    private String fileFingerprint;
}

递归遍历文件夹,将文件加入ES索引

package fun.gudu.modules.utils;

import cn.hutool.core.io.FileUtil;
import cn.hutool.poi.excel.ExcelReader;
import cn.hutool.poi.excel.ExcelUtil;
import com.alibaba.fastjson.JSONObject;
import fun.gudu.modules.entity.ArticleEntity;
import io.searchbox.client.JestClient;
import io.searchbox.core.DocumentResult;
import io.searchbox.core.Index;
import io.searchbox.core.Search;
import io.searchbox.core.SearchResult;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
import java.util.Map;

@Slf4j
@Component
public class DirectoryRecurse {

    @Autowired
    private JestClient jestClient;

    //读取文件内容转换为字符串
    private String readToString(File file, String fileType) {
        StringBuffer result = new StringBuffer();
        switch (fileType) {
            case "text/plain":
            case "java":
            case "c":
            case "cpp":
            case "log":
            case "txt":
                try (FileInputStream in = new FileInputStream(file)) {
                    Long filelength = file.length();
                    byte[] filecontent = new byte[filelength.intValue()];
                    in.read(filecontent);
                    result.append(new String(filecontent, "utf8"));
                } catch (FileNotFoundException e) {
                    log.error("{}", e.getLocalizedMessage());
                } catch (IOException e) {
                    log.error("{}", e.getLocalizedMessage());
                }
                break;
            case "pdf":
                try {
                    PDDocument document = PDDocument.load(file);
                    PDFTextStripper stripper = new PDFTextStripper();
                    result.append(stripper.getText(document));
                } catch (Exception e) {
                    log.error("{}", e.getLocalizedMessage());
                }
                break;
            case "xls":
            case "xlsx":
                ExcelReader reader = ExcelUtil.getReader(FileUtil.file(file.getPath()));
                List<Map<String,Object>> readAll = reader.readAll();
                for (Map<String, Object> map : readAll) {
                    result.append(map.toString());
                }
                break;
            case "docx":
                try {
                    FileInputStream fis = new FileInputStream(file);
                    XWPFDocument xdoc = new XWPFDocument(fis);
                    XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);
                    result.append(extractor.getText());
                    extractor.close();

//                    OPCPackage opcPackage = POIXMLDocument.openPackage(file.getPath());
//                    POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
//                    result.append(extractor.getText());
//                    extractor.close();

//                    Document document = new Document(file.getPath());
//                    String text = document.getText();
//                    result.append(text);
                } catch (Exception e) {
                    log.error("{}", e.getLocalizedMessage());
                }
                break;
            case "doc":
                //使用HWPF组件中WordExtractor类从Word文档中提取文本或段落
                try {
                    FileInputStream fis = new FileInputStream(file);
                    HWPFDocument doc = new HWPFDocument(fis);
                    result.append(doc.getText());
                    fis.close();
                } catch (Exception e) {
                    log.error("{}", e.getLocalizedMessage());
                }
                break;
            default:
                break;
        }
        return result.toString();
    }

    //判断是否已经索引
    private JSONObject isIndex(File file) {
        JSONObject result = new JSONObject();
        //用MD5生成文件指纹,搜索该指纹是否已经索引
        String fileFingerprint = Md5CaculateUtil.getMD5(file);
        result.put("fileFingerprint", fileFingerprint);
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        searchSourceBuilder.query(QueryBuilders.termQuery("fileFingerprint", fileFingerprint));
        Search search = new Search.Builder(searchSourceBuilder.toString()).addIndex("diskfile").addType("files").build();
        try {
            //执行
            SearchResult searchResult = jestClient.execute(search);
            if (searchResult.getTotal() == null) {
                result.put("isIndex", false);
            } else {
                if (searchResult.getTotal() > 0) {
                    result.put("isIndex", true);
                } else {
                    result.put("isIndex", false);
                }
            }
        } catch (IOException e) {
            log.error("{}", e.getLocalizedMessage());
        }
        return result;
    }

    //对文件目录及内容创建索引
    private void createIndex(File file, String method) {
        //忽略掉临时文件,以~$起始的文件名
        if (file.getName().startsWith("~$")) {
            return;
        }

        String fileType = null;
        switch (method) {
            case "ext":
                String filename = file.getName();
                String[] strArray = filename.split("\\.");
                int suffixIndex = strArray.length - 1;
                fileType = strArray[suffixIndex];
            default:
                break;
        }

        switch (fileType) {
            case "text/plain":
            case "java":
            case "log":
            case "c":
            case "cpp":
            case "txt":
            case "pdf":
            case "xls":
            case "xlsx":
            case "doc":
            case "docx":
                JSONObject isIndexResult = isIndex(file);
                log.info("文件名:{},文件类型:{},MD5:{},建立索引:{}", file.getPath(), fileType, isIndexResult.getString("fileFingerprint"), isIndexResult.getBoolean("isIndex"));

                if (isIndexResult.getBoolean("isIndex")) {
                    break;
                }
                //1\. 给ES中索引(保存)一个文档
                ArticleEntity article = new ArticleEntity();
                article.setTitle(file.getName());
                article.setAuthor(file.getParent());
                article.setPath(file.getPath());
                article.setContent(readToString(file, fileType));
                article.setFileFingerprint(isIndexResult.getString("fileFingerprint"));
                //2\. 构建一个索引
                Index index = new Index.Builder(article).index("diskfile").type("files").build();
                try {
                    //3\. 执行
                    if (!jestClient.execute(index).getId().isEmpty()) {
                        log.info("构建索引成功!");
                    }
                } catch (IOException e) {
                    log.error("{}", e.getLocalizedMessage());
                }
                break;
            default:
                break;
        }
    }

    public void find(String pathName) throws IOException {
        //获取pathName的File对象
        File dirFile = new File(pathName);

        //判断该文件或目录是否存在,不存在时在控制台输出提醒
        if (!dirFile.exists()) {
            log.info("do not exit");
            return;
        }

        //判断如果不是一个目录,就判断是不是一个文件,时文件则输出文件路径
        if (!dirFile.isDirectory()) {
            if (dirFile.isFile()) {
                createIndex(dirFile, "ext");
            }
            return;
        }

        //获取此目录下的所有文件名与目录名
        String[] fileList = dirFile.list();

        for (int i = 0; i < fileList.length; i++) {
            //遍历文件目录
            String string = fileList[i];
            File file = new File(dirFile.getPath(), string);
            //如果是一个目录,输出目录名后,进行递归
            if (file.isDirectory()) {
                //递归
                find(file.getCanonicalPath());
            } else {
                createIndex(file, "ext");
            }
        }
    }
}

建立索引

/**
 * GET endpoint that starts indexing at {@code indexRoot} by delegating to
 * {@code directoryRecurse.find}.
 *
 * <p>An {@link IOException} raised by the traversal is logged and not propagated.
 *
 * @return always {@code null}
 */
@RequestMapping(value = "/createindex",method = RequestMethod.GET)
public List createindex(){
    List response = null;
    try {
        directoryRecurse.find(indexRoot);
    } catch (IOException ioe) {
        log.error("{}", ioe.getLocalizedMessage());
    }
    return response;
}

搜索

package org.jeecg.modules.system.controller;

import io.searchbox.client.JestClient;
import io.searchbox.core.Search;
import io.searchbox.core.SearchResult;
import lombok.extern.slf4j.Slf4j;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.jeecg.common.api.vo.Result;
import org.jeecg.modules.system.vo.DocumentVo;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.*;
import java.io.IOException;
import java.util.*;

@Slf4j
@RestController
@RequestMapping("/document")
public class DocumentController {

    @Autowired
    private JestClient jestClient;

    @RequestMapping(value = "/search",method = RequestMethod.GET)
    public Result<List> search(@RequestParam(name="keyword") String keyword){
        Result<List> result = new Result<>();
        // 精确,词语
        MatchPhraseQueryBuilder matchPhraseQueryBuilder = QueryBuilders.matchPhraseQuery("content", keyword);
        MatchPhraseQueryBuilder matchPhraseQueryBuilder1 = QueryBuilders.matchPhraseQuery("title", keyword);
        BoolQueryBuilder childBoolQueryBuilder = new BoolQueryBuilder().should(matchPhraseQueryBuilder).should(matchPhraseQueryBuilder1);
        BoolQueryBuilder boolQueryBuilder = new BoolQueryBuilder();
        boolQueryBuilder.must(childBoolQueryBuilder);
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        searchSourceBuilder.query(boolQueryBuilder);

        // 拆分,词语
//        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
//        searchSourceBuilder.query(QueryBuilders.queryStringQuery(keyword));

        // 字段完整精确
        QueryBuilder queryBuilder = QueryBuilders.termQuery("user", "kimchy");
        QueryBUilder queryBuilder = QueryBuilders.termQuery("user", "kimchy", "wenbronk", "vini");

        // 根据ID、索引和类型删除记录
//        Delete build = new Delete.Builder("CK4R-HUBi-FTUankmkcE").index("diskfile").type("files").build();
//        jestClient.execute(build);



        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.preTags("<strong style='font-size:18px'>").postTags("</strong>");
        //path属性高亮度
        HighlightBuilder.Field highlightPath = new HighlightBuilder.Field("path");
        highlightPath.highlighterType("unified");
        highlightBuilder.field(highlightPath);
        //title字段高亮度
        HighlightBuilder.Field highlightTitle = new HighlightBuilder.Field("title");
        highlightTitle.highlighterType("unified");
        highlightBuilder.field(highlightTitle);
        //content字段高亮度
        HighlightBuilder.Field highlightContent = new HighlightBuilder.Field("content");
        highlightContent.highlighterType("unified");
        highlightBuilder.field(highlightContent);

        //高亮度配置生效
        searchSourceBuilder.highlighter(highlightBuilder);

        log.info("搜索条件{}",searchSourceBuilder.toString());

        //构建搜索功能
        Search search = new Search.Builder(searchSourceBuilder.toString()).addIndex( "diskfile" ).addType( "files" ).build();
        try {
            //执行
            SearchResult searchResult = jestClient.execute( search );
            List<SearchResult.Hit<DocumentVo, Void>> hits = searchResult.getHits(DocumentVo.class);
            result.setResult(hits);
            return result;
        } catch (IOException e) {
            log.error("{}",e.getLocalizedMessage());
        }
        return null;
    }
}

# To Be Continued!😎

Last Updated: 11/24/2020, 3:50:09 PM