熱點推薦:
您现在的位置: 電腦知識網 >> 編程 >> Java編程 >> Java開源技術 >> 正文

如何使用Lucene對html文件進行索引

2013-11-23 20:24:30  來源: Java開源技術 

  我修改了 Lucene demo 包中的 IndexHTML 類,使其可以被其他 Java 類調用。
  
  IndexHTML類
  
  import java.io.File;
  import java.util.Arrays;
  import java.util.Date;

  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.TermEnum;

  // Other classes of the Lucene demo package (e.g. HTMLDocument) are needed as well.
  import org.apache.lucene.demo.*;
  
  /**
  
  * Create html file index for searching
  
  * @author tyrone
  
  *
  
  */public class IndexHTML { private String DocsPath=null;
  
  /**
  
  * the path for index file;
  
  */ private String IndexFilePath=null;
  
  /**
  
  * true during deletion pass
  
  */  private boolean deleting = false;
  
  /**
  
  * existing index
  
  */  private IndexReader reader;
  
  /**
  
  * new index being built
  
  */  private IndexWriter writer;
  
  /**
  
  * document id iterator
  
  */  private TermEnum uidIter;
  
  private void indexDocs(File file)throws Exception {
  
  if (fileisDirectory())
  
  {
  
  // if a directory  String[] files = filelist();
  
  // list its files  Arrayssort(files);
  
  // sort the files  for (int i = ; i < fileslength;
  
  i++)  // recursively index them  thisindexDocs(new File(file files[i]));
  
  } else if (filegetPath()endsWith(l) || // l files  filegetPath()endsWith() || // files  filegetPath()endsWith(txt)) { // index txt files   if (thisuidIter != null) {  String uid = HTMLDocumentuid(file);
  
  // construct uid for doc
  
  while (uidIterterm() != null && uidIterterm()field() == uid &&
  
  uidIterterm()text(pareTo(uid) <) {
  
  if (deleting) {
  
  // delete stale docs
  
  Systemoutprintln(deleting +
  
  HTMLDocumentuidurl(uidIterterm()text()));
  
  readerdelete(uidIterterm());
  
  }
  
  uidIternext();
  
  }
  
  if (uidIterterm() != null && uidIterterm()field() == uid &&
  
  uidIterterm()text(pareTo(uid) == ) {
  
  uidIternext();
  
  // keep matching docs
  
  } else if (!deleting) {
  
  // add new docs
  
  Document doc = HTMLDocumentDocument(file);
  
  Systemoutprintln(adding + docget(url));
  
  writeraddDocument(doc);
  
  }
  
  } else { // creating a new index
  
  Document doc = HTMLDocumentDocument(file);
  
  Systemoutprintln(adding + docget(url));
  
  writeraddDocument(doc);
  
  // add docs unconditionally
  
  }
  
  } return;
  
  }
  
  /**
  
  * Walk directory hierarchy in uid order while keeping uid iterator from
  
  * existing index in sync Mismatches indicate one of:
  
  * (a) old documents to be deleted;
  
  * (b) unchanged documents to be left alone;
  
  * or (c) new documents to be indexed
  
  */  private void indexDocs(File file String index boolean create)
  
  throws Exception {
  
  if (!create) {
  
  // incrementally update
  
  reader = IndexReaderopen(index);
  
  // open existing index
  
  uidIter = readerterms(new Term(uid ));
  
  // init uid iterator
  
  thisindexDocs(file);
  
  if (deleting) {
  
  // delete rest of stale docs
  
  while (uidIterterm() != null && uidIterterm()field() == uid) {
  
  Systemoutprintln(deleting +
  
  HTMLDocumentuidurl(uidIterterm()text()));
  
  readerdelete(uidIterterm());
  
  uidIternext();
  
  }
  
  deleting = false;
  
  }
  
  uidIterclose();
  
  // close uid iterator
  
  readerclose();
  
  // close existing index
  
  } else
  
  // dont have exisiting
  
  thisindexDocs(file);
  
  }
  
  /**
  
  * if create=true create a new index else refresh old index
  
  * @param create
  
  */ public void run(boolean create)
  
  {
  
  try {
  
  String index = index;
  
  File root = null;
  
  if (thisIndexFilePath!=null)
  
  {
  
  // index file path
  
  index = thisIndexFilePath;
  
  }
  
  if (thisDocsPath==null){
  
  Systemoutprintln(root directory is not set);
  
  return;
  
  }
  
  root = new File(thisDocsPath);
  
  Date start = new Date();
  
  /**
  
  * not create then maintenance
  
  */
  
  if (!create) {
  
  // delete stale docs
  
  thisdeleting = true;
  
  thisindexDocs(root index create);
  
  }
  
  writer = new IndexWriter(index new StandardAnalyzer() create);
  
  writermaxFieldLength = ;
  
  thisindexDocs(root index create);
  
  // add new docs
  
  Systemoutprintln(Optimizing index);
  
  writeroptimize();
  
  writerclose();
  
  Date end = new Date();
  
  Systemoutprint(endgetTime() startgetTime());
  
  Systemoutprintln( total milliseconds);
  
  } catch (Exception e) {
  
  Systemoutprintln( caught a + egetClass() +
  
  \n with message: + egetMessage());
  
  }
  
  return;
  
  }
  
  /**
  
  * @return Returns the IndexFilePath
  
  */ public String getIndexFilePath() { return IndexFilePath;
  
  }
  
  /**
  
  * @param IndexFilePath The IndexFilePath to set
  
  */ public void setIndexFilePath(String property) { thisIndexFilePath = property;
  
  }
  
  /**
  
  * @return Returns the DocsPath
  
  */ public String getDocsPath() { return DocsPath;
  
  }
  
  /**
  
  * @param DocsPath The DocsPath to set
  
  */ public void setDocsPath(String property) { thisDocsPath = property;
  
  }
  
  /**
  
  * test
  
  * @param args
  
  */ public static void main(String[] args){ IndexHTML ih=new IndexHTML();
  
  ihsetDocsPath(D:\\MyProject\\colimas\\clmsdoc\\html);
  
  ihsetIndexFilePath(D:\\MyProject\\colimas\\index); ihrun(true); }}
  
  運行後生成 3 個文件:_i.cfs、deletable、segments
  
  搜索文件類
  
  /*
  
  * Created on //
  
  *
  
  * TODO To change the template for this generated file go to
  
  * Window Preferences Java Code Style Code Templates
  
  */package limassearchquery;
  
  /** * @author tyrone * * TODO To change the template for this generated type comment go to
  
  * Window Preferences Java Code Style Code Templates
  
  */public class HitsHTMLDoc { private String Title;
  
  priva
From:http://tw.wingwit.com/Article/program/Java/ky/201311/28449.html
    推薦文章
    Copyright © 2005-2013 電腦知識網 Computer Knowledge   All rights reserved.