熱點推薦:
您现在的位置: 電腦知識網 >> 編程 >> Java編程 >> Java核心技術 >> 正文

使用java將網頁保存為mht格式

2013-11-23 19:23:54  來源: Java核心技術 

  package comtag;

  import java.io.BufferedInputStream;
  import java.io.BufferedOutputStream;
  import java.io.BufferedReader;
  import java.io.ByteArrayInputStream;
  import java.io.DataOutputStream;
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.FileOutputStream;
  import java.io.FileWriter;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.InputStreamReader;
  import java.io.OutputStream;
  import java.io.Reader;
  import java.net.MalformedURLException;
  import java.net.URL;
  import java.util.*;

  import javax.activation.DataHandler;
  import javax.activation.DataSource;
  import javax.activation.MimetypesFileTypeMap;
  import javax.mail.Message;
  import javax.mail.MessagingException;
  import javax.mail.Multipart;
  import javax.mail.Session;
  import javax.mail.internet.InternetAddress;
  import javax.mail.internet.MimeBodyPart;
  import javax.mail.internet.MimeMessage;
  import javax.mail.internet.MimeMultipart;
  import javax.mail.internet.MimePartDataSource;

  import org.htmlparser.Parser;
  import org.htmlparser.Tag;
  import org.htmlparser.filters.TagNameFilter;
  import org.htmlparser.lexer.Lexer;
  import org.htmlparser.lexer.Page;
  import org.htmlparser.util.DefaultParserFeedback;
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;

  // NOTE(review): the leading package segment of this import may have been
  // truncated when the listing was extracted - confirm against the project.
  import toptrack.tools.JQuery;

  /**

  * mht文件解析類

  * @author dl

  */

  public class HtmlMHTCompiler {

  private URL strWeb = null; /**網頁地址*/

  private String strText = null; /**網頁文本內容*/

  private String strFileName = null; /**本地文件名*/

  private String strEncoding = null; /**網頁編碼*/

  //mht格式附加信息

  private String from = ;

  private String to;

  private String subject = mht compile;

  private String cc;

  private String bcc;

  private String smtp = localhost;

  public static void main(String[] args) {

  String strUrl = ;

  String strEncoding = utf;

  String strText = JQuerygetHtmlText(strUrl strEncoding null);

  if (strText == null)

  return;

  HtmlMHTCompiler ht = new HtmlMHTCompiler(strText strUrl strEncoding testmht);

  pile();

  //HtmlMHTCompilermhthtml(testmht l);

  }

  /**
   * Creates a compiler for one already-downloaded page.
   *
   * <p>If {@code strUrl} is not a valid URL the error is printed and none of the
   * fields are initialised, so a later call to {@code compile()} returns
   * {@code false}.
   *
   * @param strText     HTML text of the page
   * @param strUrl      address the page was loaded from
   * @param strEncoding character encoding of the page
   * @param strFileName path of the local MHT file to create
   */
  public HtmlMHTCompiler(String strText, String strUrl, String strEncoding, String strFileName) {
      try {
          strWeb = new URL(strUrl);
      } catch (MalformedURLException e) {
          e.printStackTrace();
          return;
      }
      this.strText = strText;
      this.strEncoding = strEncoding;
      this.strFileName = strFileName;
  }

  /**

  *<br>方法說明執行下載操作

  *<br>輸入參數

  *<br>返回類型

  */

  public boolean compile() {

  if (strWeb == null || strText == null || strFileName == null || strEncoding == null)

  return false;

  HashMap urlMap = new HashMap();

  NodeList nodes = new NodeList();

  try {

  Parser parser = createParser(strText);

  parsersetEncoding(strEncoding);

  nodes = parserparse(null);

  } catch (ParserException e) {

  // TODO Autogenerated catch block

  eprintStackTrace();

  }

  extractAllScriptNodes(nodes);

  ArrayList urlScriptList = extractAllScriptNodes(nodes urlMap);

  ArrayList urlImageList = extractAllImageNodes(nodes urlMap);

  for (Iterator iter = urlMapentrySet(erator(); iterhasNext();) {

  MapEntry entry = (MapEntry) iternext();

  String key = (String)entrygetKey();

  String val = (String)entrygetValue();

  strText = JHtmlClearreplace(strText val key);

  }

  try {

  createMhtArchive(strText urlScriptList urlImageList);

  } catch (Exception e) {

  // TODO Autogenerated catch block

  eprintStackTrace();

  return false;

  }

  return true;

  }

  /**

  *<br>方法說明建立HTML parser

  *<br>輸入參數inputHTML 網頁文本內容

  *<br>返回類型HTML parser

  */

  private Parser createParser(String inputHTML) {

  // TODO Autogenerated method stub

  Lexer mLexer = new Lexer(new Page(inputHTML));

  return new Parser(mLexer new DefaultParserFeedback(DefaultParserFeedbackQUIET));

  }

  /**

  *<br>方法說明抽取基礎URL地址

  *<br>輸入參數nodes 網頁標簽集合

  *<br>返回類型

  */

  private void extractAllScriptNodes(NodeList nodes) {

  NodeList filtered = nodesextractAllNodesThatMatch(new TagNameFilter(

  BASE) true);

  if (filtered != null && filteredsize() > ) {

  Tag tag = (Tag) filteredelementAt();

  String href = taggetAttribute(href);

  if (href != null && hreflength() > ) {

  try {

  strWeb = new URL(href);

  } catch (MalformedURLException e) {

  // TODO Autogenerated catch block

  eprintStackTrace();

  }

  }

  }

  }

  /**

  *<br>方法說明抽取網頁包含的cssjs鏈接

  *<br>輸入參數nodes 網頁標簽集合; urlMap 已存在的url集合

  *<br>返回類型cssjs鏈接的集合

  */

  private ArrayList extractAllScriptNodes(NodeList nodes HashMap urlMap) {

  ArrayList urlList = new ArrayList();

  NodeList filtered = nodesextractAllNodesThatMatch(new TagNameFilter(script) true);

  for (int i = ; i < filteredsize(); i++) {

  Tag tag = (Tag) filteredelementAt(i);

  String src = taggetAttribute(src);

  // Handle external css files url

  if (src != null && srclength() > ) {

  String innerURL = src;

  String absoluteURL = makeAbsoluteURL(strWeb innerURL);

  if (absoluteURL != null && !ntainsKey(absoluteURL)) {

  urlMapput(absoluteURL innerURL);

  ArrayList urlInfo = new ArrayList();

  urlInfoadd(innerURL);

  urlInfoadd(absoluteURL);

  urlListadd(urlInfo);

  }

  tagsetAttribute(src absoluteURL);

  }

  }

  filtered = nodesextractAllNodesThatMatch(new TagNameFilter(link) true);

  for (int i = ; i < filteredsize(); i++) {

  Tag tag = (Tag) filteredelementAt(i);

  String type = (taggetAttribute(type));

  String rel = (taggetAttribute(rel));

  String href = taggetAttribute(href);

  boolean isCssFile = false;

  if (rel != null) {

  isCssFile = relindexOf(stylesheet) != ;

  } else if (type != null) {

  isCssFile |= typeindexOf(text/css) != ;

  }

  // Handle external css files url

  if (isCssFile && href != null && hreflength() > ) {

  String innerURL = href;

  String absoluteURL = makeAbsoluteURL(strWeb innerURL);

  if (absoluteURL != null && !ntainsKey(absoluteURL)) {

  urlMapput(absoluteURL innerURL);

  ArrayList urlInfo = new ArrayList();

  urlInfoadd(innerURL);

  urlInfoadd(absoluteURL);

  urlListadd(urlInfo);

  }

  tagsetAttribute(href absoluteURL);

  }

  }

  return urlList;

  }

  /**

  *<br>方法說明抽取網頁包含的圖像鏈接

  *<br>輸入參數nodes 網頁標簽集合; urlMap 已存在的url集合

  *<br>返回類型圖像鏈接集合

  */

  private ArrayList extractAllImageNodes(NodeList nodes HashMap urlMap) {

  ArrayList urlList = new ArrayList();

  NodeList filtered = nodesextractAllNodesThatMatch(new TagNameFilter(IMG) true);

  for (int i = ; i < filteredsize(); i++) {

  Tag tag = (Tag) filteredelementAt(i);

  String src = taggetAttribute(src);

  // Handle external css files url

  if (src != null && srclength() > ) {

  String innerURL = src;

  String absoluteURL = makeAbsoluteURL(strWeb innerURL);

  if (absoluteURL != null && !ntainsKey(absoluteURL)) {

  urlMapput(absoluteURL innerURL);

  ArrayList urlInfo = new ArrayList();

  urlInfoadd(innerURL);

  urlInfoadd(absoluteURL);

  urlListadd(urlInfo);

  }

  tagsetAttribute(src absoluteURL);

  }

  }

  return urlList;

  }

  /**

  *<br>方法說明相對路徑轉絕對路徑

  *<br>輸入參數strWeb 網頁地址; innerURL 相對路徑鏈接

  *<br>返回類型絕對路徑鏈接

  */

  public static String makeAbsoluteURL(URL strWeb String innerURL) {

  // TODO Autogenerated method stub

  //去除後綴

  int pos = innerURLindexOf(?);

  if (pos != ) {

  innerURL = innerURLsubstring( pos);

  }

  if (innerURL != null

  && innerURLtoLowerCase()indexOf(http) == ) {

  Systemoutprintln(innerURL);

  return innerURL;

  }

  URL linkUri = null;

  try {

  linkUri = new URL(strWeb innerURL);

  } catch (MalformedURLException e) {

  //TODO Autogenerated catch block

  eprintStackTrace();

  return null;

  }

  String absURL = linkUritoString();

  absURL = JHtmlClearreplace(absURL / );

  absURL = JHtmlClearreplace(absURL / );

  Systemoutprintln(absURL);

  return absURL;

  }

  /**

  *<br>方法說明創建mht文件

  *<br>輸入參數content 網頁文本內容; urlScriptList 腳本鏈接集合; urlImageList 圖片鏈接集合

  *<br>返回類型

  */

  private void createMhtArchive(String content ArrayList urlScriptList ArrayList urlImageList) throws Exception {

  //Instantiate a Multipart object

  MimeMultipart mp = new MimeMultipart(related);

  Properties props = new Properties();

  propsput(mailsmtphost smtp);

  Session session = SessiongetDefaultInstance(props null);

  MimeMessage msg = new MimeMessage(session);

  // set mailer

  msgsetHeader(XMailer Code Manager SWT);

  // set from

  if (from != null) {

  msgsetFrom(new InternetAddress(from));

  }

  // set subject

  if (subject != null) {

  msgsetSubject(subject);

  }

  // to

  if (to != null) {

  InternetAddress[] toAddresses = getInetAddresses(to);

  msgsetRecipients(MessageRecipientTypeTO toAddresses);

  }

  // cc

  if (cc != null) {

  InternetAddress[] ccAddresses = getInetAddresses(cc);

  msgsetRecipients(MessageRecipientTypeCC ccAddresses);

  }

  // bcc

  if (bcc != null) {

  InternetAddress[] bccAddresses = getInetAddresses(bcc);

  msgsetRecipients(MessageRecipientTypeBCC bccAddresses);

  }

  //設置網頁正文

  MimeBodyPart bp = new MimeBodyPart();

  bpsetText(content strEncoding);

  bpaddHeader(ContentType text/html;charset= + strEncoding);

  bpaddHeader(ContentLocation strWebtoString());

  mpaddBodyPart(bp);

  int urlCount = urlScriptListsize();

  for (int i = ; i < urlCount; i++) {

  bp = new MimeBodyPart();

  ArrayList urlInfo = (ArrayList) urlScriptListget(i);

  // String url = urlInfoget()toString();

  String absoluteURL = urlInfoget()toString();

  bp

  addHeader(ContentLocation

  javaxmailinternetMimeUtility

  encodeWord(URLDecoder

  decode(absoluteURL strEncoding)));

  DataSource source = new AttachmentDataSource(absoluteURL text);

  bpsetDataHandler(new DataHandler(source));

  mpaddBodyPart(bp);

  }

  urlCount = urlImageListsize();

  for (int i = ; i < urlCount; i++) {

  bp = new MimeBodyPart();

  ArrayList urlInfo = (ArrayList) urlImageListget(i);

  // String url = urlInfoget()toString();

  String absoluteURL = urlInfoget()toString();

  bp

  addHeader(ContentLocation

  javaxmailinternetMimeUtility

  encodeWord(URLDecoder

  decode(absoluteURL strEncoding)));

  DataSource source = new AttachmentDataSource(absoluteURL image);

  bpsetDataHandler(new DataHandler(source));

  mpaddBodyPart(bp);

  }

  msgsetContent(mp);

  // write the mime multi part message to a file

  msgwriteTo(new FileOutputStream(strFileName));

  }

  /**

  *<br>方法說明mht轉html

  *<br>輸入參數strMht mht文件路徑; strHtml html文件路徑

  *<br>返回類型

  */

  public static void mhthtml(String strMht String strHtml) {

  try {

  //TODO readEmlFile

  InputStream fis = new FileInputStream(strMht);

  Session mailSession = SessiongetDefaultInstance(SystemgetProperties() null);

  MimeMessage msg = new MimeMessage(mailSession fis);

  Object content = msggetContent();

  if (content instanceof Multipart) {

  MimeMultipart mp = (MimeMultipart)content;

  MimeBodyPart bp = (MimeBodyPart)mpgetBodyPart();

  String strEncodng = getEncoding(bp);

  String strText = getHtmlText(bp strEncodng);

  if (strText == null)

  return;

  File parent = null;

  if (mpgetCount() > ) {

  parent = new File(new File(strHtml)getAbsolutePath() + files);

  parentmkdirs();

  if (!parentexists())

  return;

  }

  for (int i = ; i < mpgetCount(); ++i) {

  MimeBodyPart bp = (MimeBodyPart)mpgetBodyPart(i);

  String strUrl = getResourcesUrl(bp);

  if (strUrl == null)

  continue;

  DataHandler dataHandler = bpgetDataHandler();

  MimePartDataSource source = (MimePartDataSource)dataHandlergetDataSource();

  File resources = new File(parentgetAbsolutePath() + Fileseparator + getName(strUrl i));

  if (saveResourcesFile(resources bpgetInputStream()))

  strText = JHtmlClearreplace(strText strUrl resourcesgetAbsolutePath());

  }

  saveHtml(strText strHtml);

  }

  } catch (Exception e) {

  // TODO Autogenerated catch block

  eprintStackTrace();

  }

  }

  /**

  *<br>方法說明得到資源文件的name

  *<br>輸入參數strName 資源文件鏈接 ID 資源文件的序號

  *<br>返回類型資源文件的本地臨時文件名

  */

  public static String getName(String strName int ID) {

  char separator = /;

  Systemoutprintln(strName);

  Systemoutprintln(separator);

  if( strNamelastIndexOf(separator) >= )

  return format(strNamesubstring(strNamelastIndexOf(separator) + ));

  return temp + ID;

  }

  /**

  *<br>方法說明得到網頁編碼

  *<br>輸入參數bp MimeBodyPart類型的網頁內容

  *<br>返回類型MimeBodyPart裡的網頁內容的編碼

  */

  private static String getEncoding(MimeBodyPart bp) {

  if (bp != null) {

  try {

  Enumeration list = bpgetAllHeaders();

  while (listhasMoreElements()) {

  javaxmailHeader head = (javaxmailHeader)listnextElement();

  if (headgetName(pareTo(ContentType) == ) {

  String strType = headgetValue();

  int pos = strTypeindexOf(charset=);

  if (pos != ) {

  String strEncoding = strTypesubstring(pos + strTypelength());

  if (strEncodingtoLowerCase(pareTo(gb) == ) {

  strEncoding = gbk;

  }

  return strEncoding;

  }

  }

  }

  } catch (MessagingException e) {

  // TODO Autogenerated catch block

  eprintStackTrace();

  }

  }

  return null;

  }

  /**

  *<br>方法說明得到資源文件url

  *<br>輸入參數bp MimeBodyPart類型的網頁內容

  *<br>返回類型資源文件url

  */

  private static String getResourcesUrl(MimeBodyPart bp) {

  if (bp != null) {

  try {

  Enumeration list = bpgetAllHeaders();

  while (listhasMoreElements()) {

  javaxmailHeader head = (javaxmailHeader)listnextElement();

  if (headgetName(pareTo(ContentLocation) == ) {

  return headgetValue();

  }

  }

  } catch (MessagingException e) {

  // TODO Autogenerated catch block

  eprintStackTrace();

  }

  }

  return null;

  }

  /**

  *<br>方法說明格式化文件名

  *<br>輸入參數strName 文件名

  *<br>返回類型經過處理的符合命名規則的文件名

  */

  private static String format(String strName) {

  if (strName == null)

  return null;

  strName = strNamereplaceAll(     );

  String strText = \\/:*?\<>|^___FCKpd___quot;;

  for (int i = ; i < strNamelength(); ++i) {

  String ch = StringvalueOf(strNamecharAt(i));

  if (strTextindexOf(ch) != ) {

  strName = strNamereplace(strNamecharAt(i) );

  }

  }

  return strName;

  }

  /**

  *<br>方法說明保存資源文件

  *<br>輸入參數resources 要創建的資源文件; inputStream 要輸入文件中的流

  *<br>返回類型boolean

  */

  private static boolean saveResourcesFile(File resources InputStream inputStream) {

  if (resources == null || inputStream == null) {

  return false;

  }

  BufferedInputStream in = null;

  FileOutputStream fio = null;

  BufferedOutputStream osw = null;

  try {

  in = new BufferedInputStream(inputStream);

  fio = new FileOutputStream(resources);

  osw = new BufferedOutputStream(new DataOutputStream(fio));

  int b;

  byte[] a = new byte[];

  boolean isEmpty = true;

  while ((b = inread(a)) != ) {

  isEmpty = false;

  oswwrite(a b);

  oswflush();

  }

  oswclose();

  fioclose();

  inclose();

  inputStreamclose();

  if (isEmpty)

  resourcesdelete();

  return true;

  } catch (Exception e) {

  // TODO Autogenerated catch block

  eprintStackTrace();

  Systemoutprintln(解析mht失敗);

  return false;

  } finally{

  try {

  if (osw != null)

  oswclose();

  if (fio != null)

  fioclose();

  if (in != null)

  inclose();

  if (inputStream != null)

  inputStreamclose();

  } catch (Exception e) {

  eprintStackTrace();

  Systemoutprintln(解析mht失敗);

  return false;

  }

  }

  }

  /**

  *<br>方法說明得到mht文件的標題

  *<br>輸入參數mhtFilename mht文件名

  *<br>返回類型mht文件的標題

  */

  public static String getTitle(String mhtFilename) {

  try {

  //TODO readEmlFile

  InputStream fis = new FileInputStream(mhtFilename);

  Session mailSession = SessiongetDefaultInstance(SystemgetProperties() null);

  MimeMessage msg = new MimeMessage(mailSession fis);

  Object content = msggetContent();

  if (content instanceof Multipart) {

  MimeMultipart mp = (MimeMultipart)content;

  MimeBodyPart bp = (MimeBodyPart)mpgetBodyPart();

  String strEncodng = getEncoding(bp);

  String strText = getHtmlText(bp strEncodng);

  if (strText == null)

  return null;

  strText = strTexttoLowerCase();

  int pos = strTextindexOf(<title>);

  int pos = strTextindexOf(</title>);

  if (pos != && pos!= && pos > pos) {

  return strTextsubstring(pos + pos)trim();

  }

  }

  return null;

  } catch (Exception e) {

  // TODO Autogenerated catch block

  eprintStackTrace();

  return null;

  }

  }

  /**

  *<br>方法說明得到html文本

  *<br>輸入參數bp MimeBodyPart類型的網頁內容; strEncoding 內容編碼

  *<br>返回類型html文本

  */

  private static String getHtmlText(MimeBodyPart bp String strEncoding) {

  InputStream textStream = null;

  BufferedInputStream buff = null;

  BufferedReader br = null;

  Reader r = null;

  try {

  textStream = bpgetInputStream();

  buff = new BufferedInputStream(textStream);

  r = new InputStreamReader(buff strEncoding);

  br = new BufferedReader(r);

  StringBuffer strHtml = new StringBuffer();

  String strLine = null;

  while ((strLine = brreadLine()) != null) {

  strHtmlappend(strLine + \r\n);

  }

  brclose();

  rclose();

  textStreamclose();

  return strHtmltoString();

  } catch (Exception e) {

  // TODO Autogenerated catch block

  eprintStackTrace();

  } finally{

  try{

  if (br != null)

  brclose();

  if (buff != null)

  buffclose();

  if (textStream != null)

  textStreamclose();

  }catch(Exception e){

  Systemoutprintln(解析mht失敗);

  }

  }

  return null;

  }

  /**

  *<br>方法說明保存html文件

  *<br>輸入參數strText html內容; strHtml html文件名

  *<br>返回類型

  */

  private static void saveHtml(String strText String strHtml) {

  try {

  FileWriter fw = new FileWriter(strHtml);

  fwwrite(strText);

  fwclose();

  } catch (IOException e) {

  // TODO Autogenerated catch block

  eprintStackTrace();

  Systemoutprintln(解析mht失敗);

  }

  }

  private InternetAddress[] getInetAddresses(String emails) throws Exception {

  ArrayList list = new ArrayList();

  StringTokenizer tok = new StringTokenizer(emails );

  while (tokhasMoreTokens()) {

  listadd(toknextToken());

  }

  int count = listsize();

  InternetAddress[] addresses = new InternetAddress[count];

  for (int i = ; i < count; i++) {

  addresses[i] = new InternetAddress(listget(i)toString());

  }

  return addresses;

  }

  class AttachmentDataSource implements DataSource {

  private MimetypesFileTypeMap map = new MimetypesFileTypeMap();

  private String strUrl;

  private String strType;

  private byte[] dataSize = null;

  /**

  * This is some content type maps

  */

  private Map normalMap = new HashMap();

  {

  // Initiate normal mime type map

  // Images

  normalMapput(image image/jpeg);

  normalMapput(text text/plain);

  }

  public AttachmentDataSource(String strUrl String strType) {

  thisstrType = strType;

  thisstrUrl = strUrl;

  strUrl = strUrltrim();

  strUrl = strUrlreplaceAll( %);

  dataSize = JQuerydownBinaryFile(strUrl null);

  }

  /**

  * Returns the content type

  */

  public String getContentType() {

  return getMimeType(getName());

  }

  public String getName() {

  char separator = FileseparatorChar;

  if( strUrllastIndexOf(separator) >= )

  return strUrlsubstring(strUrllastIndexOf(separator) + );

  return strUrl;

  }

  private String getMimeType(String fileName) {

  String type = (String)normalMapget(strType);

  if (type == null) {

  try {

  type = mapgetContentType(fileName);

  } catch (Exception e) {

  // TODO: handle exception

  }

  Systemoutprintln(type);

  // Fix the null exception

  if (type == null) {

  type = application/octetstream;

  }

  }

  return type;

  }

  public InputStream getInputStream() throws IOException {

  // TODO Autogenerated method stub

  if (dataSize == null)

  dataSize = new byte[];

  return new ByteArrayInputStream(dataSize);

  }

  public OutputStream getOutputStream() throws IOException {

  // TODO Autogenerated method stub

  return new javaioByteArrayOutputStream();

  }

  }

  }


From:http://tw.wingwit.com/Article/program/Java/hx/201311/26795.html
    推薦文章
    Copyright © 2005-2013 電腦知識網 Computer Knowledge   All rights reserved.