0


Lucene全文检索

Lucene全文检索

Lucene 是一个基于 Java 的全文信息检索工具包,目前主流的搜索系统 Elasticsearch 和 Solr 都是基于 Lucene 的索引和搜索能力构建的。

Solr与Lucene的区别:

Solr和Lucene的本质区别有三点:搜索服务器、企业级和管理。

Lucene本质上是搜索库,不是独立的应用程序,而Solr是。

Lucene专注于搜索底层的建设,而Solr专注于企业应用。

Lucene不负责支撑搜索服务所必须的管理,而Solr负责。

所以说,一句话概括:Solr是Lucene面向企业搜索应用的扩展。如果直接使用Lucene,数据量超过10万就会有点力不从心了。

ES:
ES是对apache lucene的封装。
1:ES是Elasticsearch的缩写,它是一个实时的、分布式的查询和分析引擎,基于Apache Lucene开发。
2:ES的目标是让全文搜索变得简单
3:ES可以支持横向的扩展,支持PB级别的结构化和非结构化的数据处理。
4:使用ES可以以前所未有的速度来处理大数据。

今天的主题是Lucene:

Lucene生成索引:

packagecom.zking.test.lucene;/**
 * 生成索引测试
 * @author Administrator
 *
 */publicclassDemo1{publicstaticvoidmain(String[] args){//        索引文件将要存放的位置String indexDir ="E:\\temp\\test\\lucene\\demo1";//        数据源地址String dataDir ="E:\\temp\\test\\lucene\\demo1\\data";IndexCreate ic =null;try{
            ic =newIndexCreate(indexDir);long start =System.currentTimeMillis();int num = ic.index(dataDir);long end =System.currentTimeMillis();System.out.println("检索指定路径下"+num+"个文件,一共花费了"+(end-start)+"毫秒");}catch(Exception e){
            e.printStackTrace();}finally{try{
                ic.closeIndexWriter();}catch(Exception e){
                e.printStackTrace();}}}}

配合Demo1的实现:

packagecom.zking.test.lucene;importjava.io.File;importjava.io.FileReader;importjava.nio.file.Paths;importorg.apache.lucene.analysis.Analyzer;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.document.Field;importorg.apache.lucene.document.TextField;importorg.apache.lucene.index.IndexWriter;importorg.apache.lucene.index.IndexWriterConfig;importorg.apache.lucene.store.FSDirectory;/**
 * 配合Demo1.java进行lucene的helloword实现
 * @author Administrator
 *
 */publicclassIndexCreate{privateIndexWriter indexWriter;/**
     * 1、构造方法 实例化IndexWriter
     * @param indexDir 索引文件存放的地址
     * @throws Exception
     */publicIndexCreate(String indexDir)throwsException{//        获取索引文件的存放地址对象FSDirectory dir =FSDirectory.open(Paths.get(indexDir));//        标准分词器(针对英文)Analyzer analyzer =newStandardAnalyzer();//        索引输出流配置对象IndexWriterConfig conf =newIndexWriterConfig(analyzer); 
        indexWriter =newIndexWriter(dir, conf);}/**
     * 2、关闭索引输出流
     * @throws Exception
     */publicvoidcloseIndexWriter()throwsException{
        indexWriter.close();}/**
     * 3、索引指定路径下的所有文件
     * @param dataDir 数据源
     * @return
     * @throws Exception
     */publicintindex(String dataDir)throwsException{File[] files =newFile(dataDir).listFiles();for(File file : files){indexFile(file);}return indexWriter.numDocs();}/**
     * 4、索引指定的文件
     * @param file
     * @throws Exception
     */privatevoidindexFile(File file)throwsException{System.out.println("被索引文件的全路径:"+file.getCanonicalPath());Document doc =getDocument(file);
        indexWriter.addDocument(doc);}/**
     * 5、获取文档(索引文件中包含的重要信息,key-value的形式)
     * @param file
     * @return
     * @throws Exception
     */privateDocumentgetDocument(File file)throwsException{Document doc =newDocument();
        doc.add(newTextField("contents",newFileReader(file)));//        Field.Store.YES是否存储到硬盘
        doc.add(newTextField("fullPath", file.getCanonicalPath(),Field.Store.YES));
        doc.add(newTextField("fileName", file.getName(),Field.Store.YES));return doc;}}

Lucene查询索引:

packagecom.zking.test.lucene;/**
 * 查询索引测试
 * @author Administrator
 *
 */publicclassDemo2{publicstaticvoidmain(String[] args){String indexDir ="E:\\temp\\test\\lucene\\demo1";String q ="EarlyTerminating-Collector";try{IndexUse.search(indexDir, q);}catch(Exception e){
            e.printStackTrace();}}}

配合Demo2的实现:

packagecom.zking.test.lucene;importjava.nio.file.Paths;importorg.apache.lucene.analysis.Analyzer;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.index.DirectoryReader;importorg.apache.lucene.index.IndexReader;importorg.apache.lucene.queryparser.classic.QueryParser;importorg.apache.lucene.search.IndexSearcher;importorg.apache.lucene.search.Query;importorg.apache.lucene.search.ScoreDoc;importorg.apache.lucene.search.TopDocs;importorg.apache.lucene.store.FSDirectory;/**
 * 配合Demo2.java进行lucene的helloword实现
 * @author Administrator
 *
 */publicclassIndexUse{/**
     * 通过关键字在索引目录中查询
     * @param indexDir    索引文件所在目录
     * @param q    关键字
     */publicstaticvoidsearch(String indexDir,String q)throwsException{FSDirectory indexDirectory =FSDirectory.open(Paths.get(indexDir));//        注意:索引输入流不是new出来的,是通过目录读取工具类打开的IndexReader indexReader =DirectoryReader.open(indexDirectory);//        获取索引搜索对象IndexSearcher indexSearcher =newIndexSearcher(indexReader);Analyzer analyzer =newStandardAnalyzer();QueryParser queryParser =newQueryParser("contents", analyzer);//        获取符合关键字的查询对象Query query = queryParser.parse(q);long start=System.currentTimeMillis();//        获取关键字出现的前十次TopDocs topDocs = indexSearcher.search(query ,10);long end=System.currentTimeMillis();System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+topDocs.totalHits+"个记录");for(ScoreDoc scoreDoc : topDocs.scoreDocs){int docID = scoreDoc.doc;//            索引搜索对象通过文档下标获取文档Document doc = indexSearcher.doc(docID);System.out.println("通过索引文件:"+doc.get("fullPath")+"拿数据");}
        
        indexReader.close();}}

对索引的增删改:

packagecom.zking.test.lucene;importjava.nio.file.Paths;importorg.apache.lucene.analysis.Analyzer;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.document.Field;importorg.apache.lucene.document.StringField;importorg.apache.lucene.document.TextField;importorg.apache.lucene.index.IndexWriter;importorg.apache.lucene.index.IndexWriterConfig;importorg.apache.lucene.index.Term;importorg.apache.lucene.store.FSDirectory;importorg.junit.Before;importorg.junit.Test;/**
 * 构建索引
 *     对索引的增删改
 * @author Administrator
 *
 */publicclassDemo3{privateString ids[]={"1","2","3"};privateString citys[]={"qingdao","nanjing","shanghai"};privateString descs[]={"Qingdao is a beautiful city.","Nanjing is a city of culture.","Shanghai is a bustling city."};privateFSDirectory dir;/**
     * 每次都生成索引文件
     * @throws Exception
     */@BeforepublicvoidsetUp()throwsException{
        dir  =FSDirectory.open(Paths.get("E:\\temp\\test\\lucene\\demo2\\indexDir"));IndexWriter indexWriter =getIndexWriter();for(int i =0; i < ids.length; i++){Document doc =newDocument();
            doc.add(newStringField("id", ids[i],Field.Store.YES));
            doc.add(newStringField("city", citys[i],Field.Store.YES));
            doc.add(newTextField("desc", descs[i],Field.Store.NO));
            indexWriter.addDocument(doc);}
        indexWriter.close();}/**
     * 获取索引输出流
     * @return
     * @throws Exception
     */privateIndexWritergetIndexWriter()throwsException{Analyzer analyzer =newStandardAnalyzer();IndexWriterConfig conf =newIndexWriterConfig(analyzer);returnnewIndexWriter(dir, conf );}/**
     * 测试写了几个索引文件
     * @throws Exception
     */@TestpublicvoidgetWriteDocNum()throwsException{IndexWriter indexWriter =getIndexWriter();System.out.println("索引目录下生成"+indexWriter.numDocs()+"个索引文件");}/**
     * 打上标记,该索引实际并未删除
     * @throws Exception
     */@TestpublicvoiddeleteDocBeforeMerge()throwsException{IndexWriter indexWriter =getIndexWriter();System.out.println("最大文档数:"+indexWriter.maxDoc());
        indexWriter.deleteDocuments(newTerm("id","1"));
        indexWriter.commit();System.out.println("最大文档数:"+indexWriter.maxDoc());System.out.println("实际文档数:"+indexWriter.numDocs());
        indexWriter.close();}/**
     * 对应索引文件已经删除,但是该版本的分词会保留
     * @throws Exception
     */@TestpublicvoiddeleteDocAfterMerge()throwsException{//        https://blog.csdn.net/asdfsadfasdfsa/article/details/78820030//        org.apache.lucene.store.LockObtainFailedException: Lock held by this virtual machine:indexWriter是单例的、线程安全的,不允许打开多个。IndexWriter indexWriter =getIndexWriter();System.out.println("最大文档数:"+indexWriter.maxDoc());
        indexWriter.deleteDocuments(newTerm("id","1"));
        indexWriter.forceMergeDeletes();//强制删除
        indexWriter.commit();System.out.println("最大文档数:"+indexWriter.maxDoc());System.out.println("实际文档数:"+indexWriter.numDocs());
        indexWriter.close();}/**
     * 测试更新索引
     * @throws Exception
     */@TestpublicvoidtestUpdate()throwsException{IndexWriter writer=getIndexWriter();Document doc=newDocument();
        doc.add(newStringField("id","1",Field.Store.YES));
        doc.add(newStringField("city","qingdao",Field.Store.YES));
        doc.add(newTextField("desc","dsss is a city.",Field.Store.NO));
        writer.updateDocument(newTerm("id","1"), doc);
        writer.close();}}

文档域加权:

packagecom.zking.test.lucene;importjava.nio.file.Paths;importorg.apache.lucene.analysis.Analyzer;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.document.Field;importorg.apache.lucene.document.StringField;importorg.apache.lucene.document.TextField;importorg.apache.lucene.index.DirectoryReader;importorg.apache.lucene.index.IndexReader;importorg.apache.lucene.index.IndexWriter;importorg.apache.lucene.index.IndexWriterConfig;importorg.apache.lucene.index.Term;importorg.apache.lucene.search.IndexSearcher;importorg.apache.lucene.search.Query;importorg.apache.lucene.search.ScoreDoc;importorg.apache.lucene.search.TermQuery;importorg.apache.lucene.search.TopDocs;importorg.apache.lucene.store.Directory;importorg.apache.lucene.store.FSDirectory;importorg.junit.Before;importorg.junit.Test;/**
 * 文档域加权
 * @author Administrator
 *
 */publicclassDemo4{privateString ids[]={"1","2","3","4"};privateString authors[]={"Jack","Marry","John","Json"};privateString positions[]={"accounting","technician","salesperson","boss"};privateString titles[]={"Java is a good language.","Java is a cross platform language","Java powerful","You should learn java"};privateString contents[]={"If possible, use the same JRE major version at both index and search time.","When upgrading to a different JRE major version, consider re-indexing. ","Different JRE major versions may implement different versions of Unicode,","For example: with Java 1.4, `LetterTokenizer` will split around the character U+02C6,"};privateDirectory dir;//索引文件目录@BeforepublicvoidsetUp()throwsException{
        dir =FSDirectory.open(Paths.get("E:\\temp\\test\\lucene\\demo3\\indexDir"));IndexWriter writer =getIndexWriter();for(int i =0; i < authors.length; i++){Document doc =newDocument();
            doc.add(newStringField("id", ids[i],Field.Store.YES));
            doc.add(newStringField("author", authors[i],Field.Store.YES));
            doc.add(newStringField("position", positions[i],Field.Store.YES));TextField textField =newTextField("title", titles[i],Field.Store.YES);//            Json投钱做广告,把排名刷到第一了if("boss".equals(positions[i])){
                textField.setBoost(2f);//设置权重,默认为1}
            
            doc.add(textField);//            TextField会分词,StringField不会分词
            doc.add(newTextField("content", contents[i],Field.Store.NO));
            writer.addDocument(doc);}
        writer.close();}privateIndexWritergetIndexWriter()throwsException{Analyzer analyzer =newStandardAnalyzer();IndexWriterConfig conf =newIndexWriterConfig(analyzer);returnnewIndexWriter(dir, conf);}@Testpublicvoidindex()throwsException{IndexReader reader =DirectoryReader.open(dir);IndexSearcher searcher =newIndexSearcher(reader);String fieldName ="title";String keyWord ="java";Term t =newTerm(fieldName, keyWord);Query query =newTermQuery(t);TopDocs hits = searcher.search(query,10);System.out.println("关键字:‘"+keyWord+"’命中了"+hits.totalHits+"次");for(ScoreDoc scoreDoc : hits.scoreDocs){Document doc = searcher.doc(scoreDoc.doc);System.out.println(doc.get("author"));}}}

特定项搜索和查询表达式(queryParser):

packagecom.zking.test.lucene;importjava.io.IOException;importjava.nio.file.Paths;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.index.DirectoryReader;importorg.apache.lucene.index.IndexReader;importorg.apache.lucene.index.Term;importorg.apache.lucene.queryparser.classic.ParseException;importorg.apache.lucene.queryparser.classic.QueryParser;importorg.apache.lucene.search.IndexSearcher;importorg.apache.lucene.search.NumericRangeQuery;importorg.apache.lucene.search.ScoreDoc;importorg.apache.lucene.search.TermQuery;importorg.apache.lucene.search.TopDocs;importorg.apache.lucene.store.FSDirectory;importorg.junit.Before;importorg.junit.Test;/**
 * 特定项搜索
 * 查询表达式(queryParser)
 * @author Administrator
 *
 */publicclassDemo5{@BeforepublicvoidsetUp(){// 索引文件将要存放的位置String indexDir ="E:\\temp\\test\\lucene\\demo4";// 数据源地址String dataDir ="E:\\temp\\test\\lucene\\demo4\\data";IndexCreate ic =null;try{
            ic =newIndexCreate(indexDir);long start =System.currentTimeMillis();int num = ic.index(dataDir);long end =System.currentTimeMillis();System.out.println("检索指定路径下"+ num +"个文件,一共花费了"+(end - start)+"毫秒");}catch(Exception e){
            e.printStackTrace();}finally{try{
                ic.closeIndexWriter();}catch(Exception e){
                e.printStackTrace();}}}/**
     * 特定项搜索
     */@TestpublicvoidtestTermQuery(){String indexDir ="E:\\temp\\test\\lucene\\demo4";String fld ="contents";String text ="indexformattoooldexception";//        特定项片段名和关键字Term t  =newTerm(fld , text);TermQuery tq =newTermQuery(t  );try{FSDirectory indexDirectory =FSDirectory.open(Paths.get(indexDir));//            注意:索引输入流不是new出来的,是通过目录读取工具类打开的IndexReader indexReader =DirectoryReader.open(indexDirectory);//            获取索引搜索对象IndexSearcher is =newIndexSearcher(indexReader);TopDocs hits = is.search(tq,100);//            System.out.println(hits.totalHits);for(ScoreDoc scoreDoc: hits.scoreDocs){Document doc = is.doc(scoreDoc.doc);System.out.println("文件"+doc.get("fullPath")+"中含有该关键字");}}catch(IOException e){
            e.printStackTrace();}}/**
     * 查询表达式(queryParser)
     */@TestpublicvoidtestQueryParser(){String indexDir ="E:\\temp\\test\\lucene\\demo4";//        获取查询解析器(通过哪种分词器去解析哪种片段)QueryParser queryParser =newQueryParser("contents",newStandardAnalyzer());try{FSDirectory indexDirectory =FSDirectory.open(Paths.get(indexDir));//            注意:索引输入流不是new出来的,是通过目录读取工具类打开的IndexReader indexReader =DirectoryReader.open(indexDirectory);//            获取索引搜索对象IndexSearcher is =newIndexSearcher(indexReader);//            由解析器去解析对应的关键字TopDocs hits = is.search(queryParser.parse("indexformattoooldexception"),100);for(ScoreDoc scoreDoc: hits.scoreDocs){Document doc = is.doc(scoreDoc.doc);System.out.println("文件"+doc.get("fullPath")+"中含有该关键字");}}catch(IOException e){
            e.printStackTrace();}catch(ParseException e){// TODO Auto-generated catch block
            e.printStackTrace();}}}

指定数字范围查询和指定字符串开头字母查询(prefixQuery):

packagecom.zking.test.lucene;importjava.nio.file.Paths;importorg.apache.lucene.analysis.Analyzer;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.document.Field;importorg.apache.lucene.document.IntField;importorg.apache.lucene.document.StringField;importorg.apache.lucene.document.TextField;importorg.apache.lucene.index.DirectoryReader;importorg.apache.lucene.index.IndexReader;importorg.apache.lucene.index.IndexWriter;importorg.apache.lucene.index.IndexWriterConfig;importorg.apache.lucene.index.Term;importorg.apache.lucene.search.BooleanClause;importorg.apache.lucene.search.BooleanQuery;importorg.apache.lucene.search.IndexSearcher;importorg.apache.lucene.search.NumericRangeQuery;importorg.apache.lucene.search.PrefixQuery;importorg.apache.lucene.search.ScoreDoc;importorg.apache.lucene.search.TopDocs;importorg.apache.lucene.store.FSDirectory;importorg.junit.Before;importorg.junit.Test;/**
 * 指定数字范围查询
 * 指定字符串开头字母查询(prefixQuery)
 * @author Administrator
 *
 */publicclassDemo6{privateint ids[]={1,2,3};privateString citys[]={"qingdao","nanjing","shanghai"};privateString descs[]={"Qingdao is a beautiful city.","Nanjing is a city of culture.","Shanghai is a bustling city."};privateFSDirectory dir;/**
     * 每次都生成索引文件
     * @throws Exception
     */@BeforepublicvoidsetUp()throwsException{
        dir  =FSDirectory.open(Paths.get("E:\\temp\\test\\lucene\\demo2\\indexDir"));IndexWriter indexWriter =getIndexWriter();for(int i =0; i < ids.length; i++){Document doc =newDocument();
            doc.add(newIntField("id", ids[i],Field.Store.YES));
            doc.add(newStringField("city", citys[i],Field.Store.YES));
            doc.add(newTextField("desc", descs[i],Field.Store.NO));
            indexWriter.addDocument(doc);}
        indexWriter.close();}/**
     * 获取索引输出流
     * @return
     * @throws Exception
     */privateIndexWritergetIndexWriter()throwsException{Analyzer analyzer =newStandardAnalyzer();IndexWriterConfig conf =newIndexWriterConfig(analyzer);returnnewIndexWriter(dir, conf );}/**
     * 指定数字范围查询
     * @throws Exception
     */@TestpublicvoidtestNumericRangeQuery()throwsException{IndexReader reader =DirectoryReader.open(dir);IndexSearcher is =newIndexSearcher(reader);NumericRangeQuery<Integer> query=NumericRangeQuery.newIntRange("id",1,2,true,true);TopDocs hits=is.search(query,10);for(ScoreDoc scoreDoc:hits.scoreDocs){Document doc=is.doc(scoreDoc.doc);System.out.println(doc.get("id"));System.out.println(doc.get("city"));System.out.println(doc.get("desc"));}}/**
     * 指定字符串开头字母查询(prefixQuery)
     * @throws Exception
     */@TestpublicvoidtestPrefixQuery()throwsException{IndexReader reader =DirectoryReader.open(dir);IndexSearcher is =newIndexSearcher(reader);PrefixQuery query=newPrefixQuery(newTerm("city","n"));TopDocs hits=is.search(query,10);for(ScoreDoc scoreDoc:hits.scoreDocs){Document doc=is.doc(scoreDoc.doc);System.out.println(doc.get("id"));System.out.println(doc.get("city"));System.out.println(doc.get("desc"));}}@TestpublicvoidtestBooleanQuery()throwsException{IndexReader reader =DirectoryReader.open(dir);IndexSearcher is =newIndexSearcher(reader);NumericRangeQuery<Integer> query1=NumericRangeQuery.newIntRange("id",1,2,true,true);PrefixQuery query2=newPrefixQuery(newTerm("city","s"));BooleanQuery.Builder booleanQuery=newBooleanQuery.Builder();
        booleanQuery.add(query1,BooleanClause.Occur.MUST);
        booleanQuery.add(query2,BooleanClause.Occur.MUST);TopDocs hits=is.search(booleanQuery.build(),10);for(ScoreDoc scoreDoc:hits.scoreDocs){Document doc=is.doc(scoreDoc.doc);System.out.println(doc.get("id"));System.out.println(doc.get("city"));System.out.println(doc.get("desc"));}}}

本文转载自: https://blog.csdn.net/weixin_63719049/article/details/126431175
版权归原作者 追梦梓辰 所有, 如有侵权,请联系我们删除。

“Lucene全文检索”的评论:

还没有评论