Lucene全文检索
Lucene 是一个基于 Java 的全文信息检索工具包,目前主流的搜索系统 Elasticsearch 和 Solr 都是基于 Lucene 的索引和搜索能力构建的。
Solr与Lucene的区别:
Solr和Lucene的本质区别有三点:搜索服务器、企业级和管理。
Lucene本质上是搜索库,不是独立的应用程序,而Solr是。
Lucene专注于搜索底层的建设,而Solr专注于企业应用。
Lucene不负责支撑搜索服务所必须的管理,而Solr负责。
所以说,一句话概括:Solr是Lucene面向企业搜索应用的扩展。如果Lucene数据量超过10万就会有点力不从心了。
ES:
ES是对apache lucene的封装。
1:ES是Elasticsearch的缩写,它是一个实时的、分布式的查询和分析引擎,基于Apache Lucene开发。
2:ES的目标是让全文搜索变得简单
3:ES可以支持横向的扩展,支持PB级别的结构化和非结构化数据处理。
4:使用ES可以以前所未有的速度来处理大数据。
今天的主题是Lucene:
Lucene生成索引:
packagecom.zking.test.lucene;/**
* 生成索引测试
* @author Administrator
*
*/publicclassDemo1{publicstaticvoidmain(String[] args){// 索引文件将要存放的位置String indexDir ="E:\\temp\\test\\lucene\\demo1";// 数据源地址String dataDir ="E:\\temp\\test\\lucene\\demo1\\data";IndexCreate ic =null;try{
ic =newIndexCreate(indexDir);long start =System.currentTimeMillis();int num = ic.index(dataDir);long end =System.currentTimeMillis();System.out.println("检索指定路径下"+num+"个文件,一共花费了"+(end-start)+"毫秒");}catch(Exception e){
e.printStackTrace();}finally{try{
ic.closeIndexWriter();}catch(Exception e){
e.printStackTrace();}}}}
配合Demo1的实现:
packagecom.zking.test.lucene;importjava.io.File;importjava.io.FileReader;importjava.nio.file.Paths;importorg.apache.lucene.analysis.Analyzer;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.document.Field;importorg.apache.lucene.document.TextField;importorg.apache.lucene.index.IndexWriter;importorg.apache.lucene.index.IndexWriterConfig;importorg.apache.lucene.store.FSDirectory;/**
* 配合Demo1.java进行lucene的helloword实现
* @author Administrator
*
*/publicclassIndexCreate{privateIndexWriter indexWriter;/**
* 1、构造方法 实例化IndexWriter
* @param indexDir 索引文件存放的地址
* @throws Exception
*/publicIndexCreate(String indexDir)throwsException{// 获取索引文件的存放地址对象FSDirectory dir =FSDirectory.open(Paths.get(indexDir));// 标准分词器(针对英文)Analyzer analyzer =newStandardAnalyzer();// 索引输出流配置对象IndexWriterConfig conf =newIndexWriterConfig(analyzer);
indexWriter =newIndexWriter(dir, conf);}/**
* 2、关闭索引输出流
* @throws Exception
*/publicvoidcloseIndexWriter()throwsException{
indexWriter.close();}/**
* 3、索引指定路径下的所有文件
* @param dataDir 数据源
* @return
* @throws Exception
*/publicintindex(String dataDir)throwsException{File[] files =newFile(dataDir).listFiles();for(File file : files){indexFile(file);}return indexWriter.numDocs();}/**
* 4、索引指定的文件
* @param file
* @throws Exception
*/privatevoidindexFile(File file)throwsException{System.out.println("被索引文件的全路径:"+file.getCanonicalPath());Document doc =getDocument(file);
indexWriter.addDocument(doc);}/**
* 5、获取文档(索引文件中包含的重要信息,key-value的形式)
* @param file
* @return
* @throws Exception
*/privateDocumentgetDocument(File file)throwsException{Document doc =newDocument();
doc.add(newTextField("contents",newFileReader(file)));// Field.Store.YES是否存储到硬盘
doc.add(newTextField("fullPath", file.getCanonicalPath(),Field.Store.YES));
doc.add(newTextField("fileName", file.getName(),Field.Store.YES));return doc;}}
Lucene查询索引:
packagecom.zking.test.lucene;/**
* 查询索引测试
* @author Administrator
*
*/publicclassDemo2{publicstaticvoidmain(String[] args){String indexDir ="E:\\temp\\test\\lucene\\demo1";String q ="EarlyTerminating-Collector";try{IndexUse.search(indexDir, q);}catch(Exception e){
e.printStackTrace();}}}
配合Demo2的实现:
packagecom.zking.test.lucene;importjava.nio.file.Paths;importorg.apache.lucene.analysis.Analyzer;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.index.DirectoryReader;importorg.apache.lucene.index.IndexReader;importorg.apache.lucene.queryparser.classic.QueryParser;importorg.apache.lucene.search.IndexSearcher;importorg.apache.lucene.search.Query;importorg.apache.lucene.search.ScoreDoc;importorg.apache.lucene.search.TopDocs;importorg.apache.lucene.store.FSDirectory;/**
* 配合Demo2.java进行lucene的helloword实现
* @author Administrator
*
*/publicclassIndexUse{/**
* 通过关键字在索引目录中查询
* @param indexDir 索引文件所在目录
* @param q 关键字
*/publicstaticvoidsearch(String indexDir,String q)throwsException{FSDirectory indexDirectory =FSDirectory.open(Paths.get(indexDir));// 注意:索引输入流不是new出来的,是通过目录读取工具类打开的IndexReader indexReader =DirectoryReader.open(indexDirectory);// 获取索引搜索对象IndexSearcher indexSearcher =newIndexSearcher(indexReader);Analyzer analyzer =newStandardAnalyzer();QueryParser queryParser =newQueryParser("contents", analyzer);// 获取符合关键字的查询对象Query query = queryParser.parse(q);long start=System.currentTimeMillis();// 获取关键字出现的前十次TopDocs topDocs = indexSearcher.search(query ,10);long end=System.currentTimeMillis();System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+topDocs.totalHits+"个记录");for(ScoreDoc scoreDoc : topDocs.scoreDocs){int docID = scoreDoc.doc;// 索引搜索对象通过文档下标获取文档Document doc = indexSearcher.doc(docID);System.out.println("通过索引文件:"+doc.get("fullPath")+"拿数据");}
indexReader.close();}}
对索引的增删改:
packagecom.zking.test.lucene;importjava.nio.file.Paths;importorg.apache.lucene.analysis.Analyzer;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.document.Field;importorg.apache.lucene.document.StringField;importorg.apache.lucene.document.TextField;importorg.apache.lucene.index.IndexWriter;importorg.apache.lucene.index.IndexWriterConfig;importorg.apache.lucene.index.Term;importorg.apache.lucene.store.FSDirectory;importorg.junit.Before;importorg.junit.Test;/**
* 构建索引
* 对索引的增删改
* @author Administrator
*
*/publicclassDemo3{privateString ids[]={"1","2","3"};privateString citys[]={"qingdao","nanjing","shanghai"};privateString descs[]={"Qingdao is a beautiful city.","Nanjing is a city of culture.","Shanghai is a bustling city."};privateFSDirectory dir;/**
* 每次都生成索引文件
* @throws Exception
*/@BeforepublicvoidsetUp()throwsException{
dir =FSDirectory.open(Paths.get("E:\\temp\\test\\lucene\\demo2\\indexDir"));IndexWriter indexWriter =getIndexWriter();for(int i =0; i < ids.length; i++){Document doc =newDocument();
doc.add(newStringField("id", ids[i],Field.Store.YES));
doc.add(newStringField("city", citys[i],Field.Store.YES));
doc.add(newTextField("desc", descs[i],Field.Store.NO));
indexWriter.addDocument(doc);}
indexWriter.close();}/**
* 获取索引输出流
* @return
* @throws Exception
*/privateIndexWritergetIndexWriter()throwsException{Analyzer analyzer =newStandardAnalyzer();IndexWriterConfig conf =newIndexWriterConfig(analyzer);returnnewIndexWriter(dir, conf );}/**
* 测试写了几个索引文件
* @throws Exception
*/@TestpublicvoidgetWriteDocNum()throwsException{IndexWriter indexWriter =getIndexWriter();System.out.println("索引目录下生成"+indexWriter.numDocs()+"个索引文件");}/**
* 打上标记,该索引实际并未删除
* @throws Exception
*/@TestpublicvoiddeleteDocBeforeMerge()throwsException{IndexWriter indexWriter =getIndexWriter();System.out.println("最大文档数:"+indexWriter.maxDoc());
indexWriter.deleteDocuments(newTerm("id","1"));
indexWriter.commit();System.out.println("最大文档数:"+indexWriter.maxDoc());System.out.println("实际文档数:"+indexWriter.numDocs());
indexWriter.close();}/**
* 对应索引文件已经删除,但是该版本的分词会保留
* @throws Exception
*/@TestpublicvoiddeleteDocAfterMerge()throwsException{// https://blog.csdn.net/asdfsadfasdfsa/article/details/78820030// org.apache.lucene.store.LockObtainFailedException: Lock held by this virtual machine:indexWriter是单例的、线程安全的,不允许打开多个。IndexWriter indexWriter =getIndexWriter();System.out.println("最大文档数:"+indexWriter.maxDoc());
indexWriter.deleteDocuments(newTerm("id","1"));
indexWriter.forceMergeDeletes();//强制删除
indexWriter.commit();System.out.println("最大文档数:"+indexWriter.maxDoc());System.out.println("实际文档数:"+indexWriter.numDocs());
indexWriter.close();}/**
* 测试更新索引
* @throws Exception
*/@TestpublicvoidtestUpdate()throwsException{IndexWriter writer=getIndexWriter();Document doc=newDocument();
doc.add(newStringField("id","1",Field.Store.YES));
doc.add(newStringField("city","qingdao",Field.Store.YES));
doc.add(newTextField("desc","dsss is a city.",Field.Store.NO));
writer.updateDocument(newTerm("id","1"), doc);
writer.close();}}
文档域加权:
packagecom.zking.test.lucene;importjava.nio.file.Paths;importorg.apache.lucene.analysis.Analyzer;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.document.Field;importorg.apache.lucene.document.StringField;importorg.apache.lucene.document.TextField;importorg.apache.lucene.index.DirectoryReader;importorg.apache.lucene.index.IndexReader;importorg.apache.lucene.index.IndexWriter;importorg.apache.lucene.index.IndexWriterConfig;importorg.apache.lucene.index.Term;importorg.apache.lucene.search.IndexSearcher;importorg.apache.lucene.search.Query;importorg.apache.lucene.search.ScoreDoc;importorg.apache.lucene.search.TermQuery;importorg.apache.lucene.search.TopDocs;importorg.apache.lucene.store.Directory;importorg.apache.lucene.store.FSDirectory;importorg.junit.Before;importorg.junit.Test;/**
* 文档域加权
* @author Administrator
*
*/publicclassDemo4{privateString ids[]={"1","2","3","4"};privateString authors[]={"Jack","Marry","John","Json"};privateString positions[]={"accounting","technician","salesperson","boss"};privateString titles[]={"Java is a good language.","Java is a cross platform language","Java powerful","You should learn java"};privateString contents[]={"If possible, use the same JRE major version at both index and search time.","When upgrading to a different JRE major version, consider re-indexing. ","Different JRE major versions may implement different versions of Unicode,","For example: with Java 1.4, `LetterTokenizer` will split around the character U+02C6,"};privateDirectory dir;//索引文件目录@BeforepublicvoidsetUp()throwsException{
dir =FSDirectory.open(Paths.get("E:\\temp\\test\\lucene\\demo3\\indexDir"));IndexWriter writer =getIndexWriter();for(int i =0; i < authors.length; i++){Document doc =newDocument();
doc.add(newStringField("id", ids[i],Field.Store.YES));
doc.add(newStringField("author", authors[i],Field.Store.YES));
doc.add(newStringField("position", positions[i],Field.Store.YES));TextField textField =newTextField("title", titles[i],Field.Store.YES);// Json投钱做广告,把排名刷到第一了if("boss".equals(positions[i])){
textField.setBoost(2f);//设置权重,默认为1}
doc.add(textField);// TextField会分词,StringField不会分词
doc.add(newTextField("content", contents[i],Field.Store.NO));
writer.addDocument(doc);}
writer.close();}privateIndexWritergetIndexWriter()throwsException{Analyzer analyzer =newStandardAnalyzer();IndexWriterConfig conf =newIndexWriterConfig(analyzer);returnnewIndexWriter(dir, conf);}@Testpublicvoidindex()throwsException{IndexReader reader =DirectoryReader.open(dir);IndexSearcher searcher =newIndexSearcher(reader);String fieldName ="title";String keyWord ="java";Term t =newTerm(fieldName, keyWord);Query query =newTermQuery(t);TopDocs hits = searcher.search(query,10);System.out.println("关键字:‘"+keyWord+"’命中了"+hits.totalHits+"次");for(ScoreDoc scoreDoc : hits.scoreDocs){Document doc = searcher.doc(scoreDoc.doc);System.out.println(doc.get("author"));}}}
特定项搜索和查询表达式(queryParser):
packagecom.zking.test.lucene;importjava.io.IOException;importjava.nio.file.Paths;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.index.DirectoryReader;importorg.apache.lucene.index.IndexReader;importorg.apache.lucene.index.Term;importorg.apache.lucene.queryparser.classic.ParseException;importorg.apache.lucene.queryparser.classic.QueryParser;importorg.apache.lucene.search.IndexSearcher;importorg.apache.lucene.search.NumericRangeQuery;importorg.apache.lucene.search.ScoreDoc;importorg.apache.lucene.search.TermQuery;importorg.apache.lucene.search.TopDocs;importorg.apache.lucene.store.FSDirectory;importorg.junit.Before;importorg.junit.Test;/**
* 特定项搜索
* 查询表达式(queryParser)
* @author Administrator
*
*/publicclassDemo5{@BeforepublicvoidsetUp(){// 索引文件将要存放的位置String indexDir ="E:\\temp\\test\\lucene\\demo4";// 数据源地址String dataDir ="E:\\temp\\test\\lucene\\demo4\\data";IndexCreate ic =null;try{
ic =newIndexCreate(indexDir);long start =System.currentTimeMillis();int num = ic.index(dataDir);long end =System.currentTimeMillis();System.out.println("检索指定路径下"+ num +"个文件,一共花费了"+(end - start)+"毫秒");}catch(Exception e){
e.printStackTrace();}finally{try{
ic.closeIndexWriter();}catch(Exception e){
e.printStackTrace();}}}/**
* 特定项搜索
*/@TestpublicvoidtestTermQuery(){String indexDir ="E:\\temp\\test\\lucene\\demo4";String fld ="contents";String text ="indexformattoooldexception";// 特定项片段名和关键字Term t =newTerm(fld , text);TermQuery tq =newTermQuery(t );try{FSDirectory indexDirectory =FSDirectory.open(Paths.get(indexDir));// 注意:索引输入流不是new出来的,是通过目录读取工具类打开的IndexReader indexReader =DirectoryReader.open(indexDirectory);// 获取索引搜索对象IndexSearcher is =newIndexSearcher(indexReader);TopDocs hits = is.search(tq,100);// System.out.println(hits.totalHits);for(ScoreDoc scoreDoc: hits.scoreDocs){Document doc = is.doc(scoreDoc.doc);System.out.println("文件"+doc.get("fullPath")+"中含有该关键字");}}catch(IOException e){
e.printStackTrace();}}/**
* 查询表达式(queryParser)
*/@TestpublicvoidtestQueryParser(){String indexDir ="E:\\temp\\test\\lucene\\demo4";// 获取查询解析器(通过哪种分词器去解析哪种片段)QueryParser queryParser =newQueryParser("contents",newStandardAnalyzer());try{FSDirectory indexDirectory =FSDirectory.open(Paths.get(indexDir));// 注意:索引输入流不是new出来的,是通过目录读取工具类打开的IndexReader indexReader =DirectoryReader.open(indexDirectory);// 获取索引搜索对象IndexSearcher is =newIndexSearcher(indexReader);// 由解析器去解析对应的关键字TopDocs hits = is.search(queryParser.parse("indexformattoooldexception"),100);for(ScoreDoc scoreDoc: hits.scoreDocs){Document doc = is.doc(scoreDoc.doc);System.out.println("文件"+doc.get("fullPath")+"中含有该关键字");}}catch(IOException e){
e.printStackTrace();}catch(ParseException e){// TODO Auto-generated catch block
e.printStackTrace();}}}
指定数字范围查询和指定字符串开头字母查询(prefixQuery):
packagecom.zking.test.lucene;importjava.nio.file.Paths;importorg.apache.lucene.analysis.Analyzer;importorg.apache.lucene.analysis.standard.StandardAnalyzer;importorg.apache.lucene.document.Document;importorg.apache.lucene.document.Field;importorg.apache.lucene.document.IntField;importorg.apache.lucene.document.StringField;importorg.apache.lucene.document.TextField;importorg.apache.lucene.index.DirectoryReader;importorg.apache.lucene.index.IndexReader;importorg.apache.lucene.index.IndexWriter;importorg.apache.lucene.index.IndexWriterConfig;importorg.apache.lucene.index.Term;importorg.apache.lucene.search.BooleanClause;importorg.apache.lucene.search.BooleanQuery;importorg.apache.lucene.search.IndexSearcher;importorg.apache.lucene.search.NumericRangeQuery;importorg.apache.lucene.search.PrefixQuery;importorg.apache.lucene.search.ScoreDoc;importorg.apache.lucene.search.TopDocs;importorg.apache.lucene.store.FSDirectory;importorg.junit.Before;importorg.junit.Test;/**
* 指定数字范围查询
* 指定字符串开头字母查询(prefixQuery)
* @author Administrator
*
*/publicclassDemo6{privateint ids[]={1,2,3};privateString citys[]={"qingdao","nanjing","shanghai"};privateString descs[]={"Qingdao is a beautiful city.","Nanjing is a city of culture.","Shanghai is a bustling city."};privateFSDirectory dir;/**
* 每次都生成索引文件
* @throws Exception
*/@BeforepublicvoidsetUp()throwsException{
dir =FSDirectory.open(Paths.get("E:\\temp\\test\\lucene\\demo2\\indexDir"));IndexWriter indexWriter =getIndexWriter();for(int i =0; i < ids.length; i++){Document doc =newDocument();
doc.add(newIntField("id", ids[i],Field.Store.YES));
doc.add(newStringField("city", citys[i],Field.Store.YES));
doc.add(newTextField("desc", descs[i],Field.Store.NO));
indexWriter.addDocument(doc);}
indexWriter.close();}/**
* 获取索引输出流
* @return
* @throws Exception
*/privateIndexWritergetIndexWriter()throwsException{Analyzer analyzer =newStandardAnalyzer();IndexWriterConfig conf =newIndexWriterConfig(analyzer);returnnewIndexWriter(dir, conf );}/**
* 指定数字范围查询
* @throws Exception
*/@TestpublicvoidtestNumericRangeQuery()throwsException{IndexReader reader =DirectoryReader.open(dir);IndexSearcher is =newIndexSearcher(reader);NumericRangeQuery<Integer> query=NumericRangeQuery.newIntRange("id",1,2,true,true);TopDocs hits=is.search(query,10);for(ScoreDoc scoreDoc:hits.scoreDocs){Document doc=is.doc(scoreDoc.doc);System.out.println(doc.get("id"));System.out.println(doc.get("city"));System.out.println(doc.get("desc"));}}/**
* 指定字符串开头字母查询(prefixQuery)
* @throws Exception
*/@TestpublicvoidtestPrefixQuery()throwsException{IndexReader reader =DirectoryReader.open(dir);IndexSearcher is =newIndexSearcher(reader);PrefixQuery query=newPrefixQuery(newTerm("city","n"));TopDocs hits=is.search(query,10);for(ScoreDoc scoreDoc:hits.scoreDocs){Document doc=is.doc(scoreDoc.doc);System.out.println(doc.get("id"));System.out.println(doc.get("city"));System.out.println(doc.get("desc"));}}@TestpublicvoidtestBooleanQuery()throwsException{IndexReader reader =DirectoryReader.open(dir);IndexSearcher is =newIndexSearcher(reader);NumericRangeQuery<Integer> query1=NumericRangeQuery.newIntRange("id",1,2,true,true);PrefixQuery query2=newPrefixQuery(newTerm("city","s"));BooleanQuery.Builder booleanQuery=newBooleanQuery.Builder();
booleanQuery.add(query1,BooleanClause.Occur.MUST);
booleanQuery.add(query2,BooleanClause.Occur.MUST);TopDocs hits=is.search(booleanQuery.build(),10);for(ScoreDoc scoreDoc:hits.scoreDocs){Document doc=is.doc(scoreDoc.doc);System.out.println(doc.get("id"));System.out.println(doc.get("city"));System.out.println(doc.get("desc"));}}}
版权归原作者 追梦梓辰 所有, 如有侵权,请联系我们删除。