package step1; import java.io.File; import java.io.IOException; import org.apache.commons.io.FileUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.LongField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class WriterIndex { //创建索引库 public static void createIndex() throws IOException{ //创建索引库 Directory dir = FSDirectory.open(new File("/temp/doc/1101/index")); //创建标准分析器 Analyzer analyzer = new StandardAnalyzer(); //创建indexwriterConfig对象 //第一个参数:lucene的版本信息,可以选择对应的lucene版本也可以使用LATEST //第二个参数:分析器对象 IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer); //创建indexwriter对象 IndexWriter index = new IndexWriter(dir,config); //原始文档的路径 File resource = new File("source/searchsource"); for (File f : resource.listFiles()) { //文件名 String fileName = f.getName(); //文件内容 String fileContent = FileUtils.readFileToString(f); //文件路径 String filePath = f.getPath(); //文件大小 long fileSize = FileUtils.sizeOf(f); //创建文件名域 //第一个参数:域的名称 //第二个参数:域的内容 //第三个参数:是否存储 Field fileNameField = new TextField("filename", fileName, Store.YES); //文件内容域 Field fileContentField = new TextField("content", fileContent, Store.YES); //文件路径域(不分析、不索引、只存储) Field filePathField = new StoredField("path", filePath); //文件大小域 Field fileSizeField = new LongField("size", fileSize, Store.YES); //创建document对象 document document = new document(); //添加field document.add(fileNameField); document.add(fileContentField); document.add(filePathField); document.add(fileSizeField); index.adddocument(document); } //关闭indexwriter index.close(); } }第2关:查询索引
package step2; import java.io.File; import java.io.IOException; import javax.sql.rowset.serial.SerialArray; import org.apache.lucene.document.document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; public class SearchIndex { public static void searchIndex() throws IOException{ Directory directory = FSDirectory.open(new File("/temp/doc/1101/index")); IndexReader reader = DirectoryReader.open(directory); //创建indexsearcher对象 IndexSearcher searcher = new IndexSearcher(reader); //创建查询 Query query = new TermQuery(new Term("content","mybatis")); //执行查询 //第一个参数是查询对象,第二个参数是查询结果返回的最大值 TopDocs topDocs = searcher.search(query, 10); //查询结束的总条数 System.out.println("查询结果的总条数:" + topDocs.totalHits); //遍历查询结果 //topDocs.scoreDocs 存储了document对象的id ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scoreDoc : scoreDocs) { //sourceDoc.doc属性就是document对象的Id //根据document的id找到document对象的id //根据id获取document对象 document document = searcher.doc(scoreDoc.doc); //System.out.println(document.get("filename")); System.out.println(document.get("path")); System.out.println(document.get("content")); } reader.close(); } }第3关:分词器的使用
package step3; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cjk.CJKAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; public class AnalyzerTest { public static void main(String[] args) throws IOException { //创建一个标准分析器对象 Analyzer analyzer = new CJKAnalyzer(); //获得tokenStream对象 //第一个参数:域名,可以随便给一个 //第二个参数:要分析的文本内容 TokenStream tokenStream = analyzer.tokenStream("test", "我喜欢在Educoder上学习"); //添加一个引用,可以获得每个关键词 CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); //添加一个偏移量的引用,记录了关键词的开始位置以及结束位置 OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); //将指针调整到列表的头部 tokenStream.reset(); //遍历关键词列表,通过incrementToken方法判断列表是否结束 while(tokenStream.incrementToken()) { //关键词的起始位置 System.out.println("start->" + offsetAttribute.startOffset()); //取关键词 System.out.println(charTermAttribute); //结束位置 System.out.println("end->" + offsetAttribute.endOffset()); } tokenStream.close(); } }
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)