使用 Java 將網頁保存為 MHT 格式(1)

字號:

package com.tag;
    import java.io.BufferedInputStream;
    import java.io.BufferedOutputStream;
    import java.io.BufferedReader;
    import java.io.ByteArrayInputStream;
    import java.io.DataOutputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.OutputStream;
    import java.io.Reader;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.*;
    import org.htmlparser.Parser;
    import org.htmlparser.Tag;
    import org.htmlparser.filters.TagNameFilter;
    import org.htmlparser.lexer.Lexer;
    import org.htmlparser.lexer.Page;
    import org.htmlparser.util.DefaultParserFeedback;
    import org.htmlparser.util.NodeList;
    import org.htmlparser.util.ParserException;
    import toptrack.tools.JQuery;
    import javax.activation.DataHandler;
    import javax.activation.DataSource;
    import javax.activation.MimetypesFileTypeMap;
    import javax.mail.Message;
    import javax.mail.MessagingException;
    import javax.mail.Multipart;
    import javax.mail.Session;
    import javax.mail.internet.InternetAddress;
    import javax.mail.internet.MimeBodyPart;
    import javax.mail.internet.MimeMessage;
    import javax.mail.internet.MimeMultipart;
    import javax.mail.internet.MimePartDataSource;
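    /**
     * MHT file compiler: takes the text of a web page, collects the script, stylesheet
     * and image links it references, and builds an MHT archive from them.
     *
     * @author examda
     */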
    public class Html2MHTCompiler {
    private URL strWeb = null; /** Web page address; used as the base when resolving relative links. */
    private String strText = null; /** Text content of the web page. */
    private String strFileName = null; /** Local file name. */
    private String strEncoding = null; /** Encoding of the web page. */
    // Additional information for the MHT format; these values are applied to the
    // MimeMessage built in createMhtArchive (to, cc and bcc are skipped while null).
    private String from = "dongle2001@126.com";
    private String to;
    private String subject = "mht compile";
    private String cc;
    private String bcc;
    private String smtp = "localhost";
    public static void main(String[] args) {
    String strUrl = "http://www.mtime.com/my/tropicofcancer/blog/843555/";
    String strEncoding = "utf-8";
    String strText = JQuery.getHtmlText(strUrl, strEncoding, null);
    if (strText == null)
    return;
    Html2MHTCompiler h2t = new Html2MHTCompiler(strText, strUrl, strEncoding, "test.mht");
    h2t.compile();
    //Html2MHTCompiler.mht2html("test.mht", "a.html");
    }
    /**
    *方法說(shuō)明:初始化
    *輸入?yún)?shù):strText 網(wǎng)頁(yè)文本內(nèi)容; strUrl 網(wǎng)頁(yè)地址; strEncoding 網(wǎng)頁(yè)編碼; strFileName 本地文件名
    *返回類型:
    */
    public Html2MHTCompiler(String strText, String strUrl, String strEncoding, String strFileName) {
    // TODO Auto-generated constructor stub
    try {
    strWeb = new URL(strUrl);
    } catch (MalformedURLException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
    return;
    }
    this.strText = strText;
    this.strEncoding = strEncoding;
    this.strFileName = strFileName;
    }
    /**
    *方法說(shuō)明:執(zhí)行下載操作
    *輸入?yún)?shù):
    *返回類型:
    */public boolean compile() {
    if (strWeb == null || strText == null || strFileName == null || strEncoding == null)
    return false;
    HashMap urlMap = new HashMap();
    NodeList nodes = new NodeList();
    try {
    Parser parser = createParser(strText);
    parser.setEncoding(strEncoding);
    nodes = parser.parse(null);
    } catch (ParserException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
    }
    extractAllScriptNodes(nodes);
    ArrayList urlScriptList = extractAllScriptNodes(nodes, urlMap);
    ArrayList urlImageList = extractAllImageNodes(nodes, urlMap);
    for (Iterator iter = urlMap.entrySet().iterator(); iter.hasNext();) {
    Map.Entry entry = (Map.Entry) iter.next();
    String key = (String)entry.getKey();
    String val = (String)entry.getValue();
    strText = JHtmlClear.replace(strText, val, key);
    }
    try {
    createMhtArchive(strText, urlScriptList, urlImageList);
    } catch (Exception e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
    return false;
    }
    return true;
    }
    /**
    *方法說(shuō)明:建立HTML parser
    *輸入?yún)?shù):inputHTML 網(wǎng)頁(yè)文本內(nèi)容
    *返回類型:HTML parser
    */
    private Parser createParser(String inputHTML) {
    // TODO Auto-generated method stub
    Lexer mLexer = new Lexer(new Page(inputHTML));
    return new Parser(mLexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
    }
    /**
    *方法說(shuō)明:抽取基礎(chǔ)URL地址
    *輸入?yún)?shù):nodes 網(wǎng)頁(yè)標(biāo)簽集合
    *返回類型:
    */
    private void extractAllScriptNodes(NodeList nodes) {
    NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter(
    "BASE"), true);
    if (filtered != null && filtered.size() > 0) {
    Tag tag = (Tag) filtered.elementAt(0);
    String href = tag.getAttribute("href");
    if (href != null && href.length() > 0) {
    try {
    strWeb = new URL(href);
    } catch (MalformedURLException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
    }
    }
    }
    }
    /**
    *方法說(shuō)明:抽取網(wǎng)頁(yè)包含的css,js鏈接
    *輸入?yún)?shù):nodes 網(wǎng)頁(yè)標(biāo)簽集合; urlMap 已存在的url集合
    *返回類型:css,js鏈接的集合
    */
    private ArrayList extractAllScriptNodes(NodeList nodes, HashMap urlMap) {
    ArrayList urlList = new ArrayList();
    NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("script"), true);
    for (int i = 0; i < filtered.size(); i++) {
    Tag tag = (Tag) filtered.elementAt(i);
    String src = tag.getAttribute("src");
    // Handle external css file’s url
    if (src != null && src.length() > 0) {
    String innerURL = src;
    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
    urlMap.put(absoluteURL, innerURL);
    ArrayList urlInfo = new ArrayList();
    urlInfo.add(innerURL);
    urlInfo.add(absoluteURL);
    urlList.add(urlInfo);
    }
    tag.setAttribute("src", absoluteURL);
    }
    }
    filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("link"), true);
    for (int i = 0; i < filtered.size(); i++) {
    Tag tag = (Tag) filtered.elementAt(i);
    String type = (tag.getAttribute("type"));
    String rel = (tag.getAttribute("rel"));
    String href = tag.getAttribute("href");
    boolean isCssFile = false;
    if (rel != null) {
    isCssFile = rel.indexOf("stylesheet") != -1;
    } else if (type != null) {
    isCssFile |= type.indexOf("text/css") != -1;
    }
    // Handle external css file’s url
    if (isCssFile && href != null && href.length() > 0) {
    String innerURL = href;
    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
    urlMap.put(absoluteURL, innerURL);
    ArrayList urlInfo = new ArrayList();
    urlInfo.add(innerURL);
    urlInfo.add(absoluteURL);
    urlList.add(urlInfo);
    }
    tag.setAttribute("href", absoluteURL);
    }
    }
    return urlList;
    }
    /**
    *方法說(shuō)明:抽取網(wǎng)頁(yè)包含的圖像鏈接
    *輸入?yún)?shù):nodes 網(wǎng)頁(yè)標(biāo)簽集合; urlMap 已存在的url集合
    *返回類型:圖像鏈接集合
    */private ArrayList extractAllImageNodes(NodeList nodes, HashMap urlMap) {
    ArrayList urlList = new ArrayList();
    NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("IMG"), true);
    for (int i = 0; i < filtered.size(); i++) {
    Tag tag = (Tag) filtered.elementAt(i);
    String src = tag.getAttribute("src");
    // Handle external css file’s url
    if (src != null && src.length() > 0) {
    String innerURL = src;
    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
    urlMap.put(absoluteURL, innerURL);
    ArrayList urlInfo = new ArrayList();
    urlInfo.add(innerURL);
    urlInfo.add(absoluteURL);
    urlList.add(urlInfo);
    }
    tag.setAttribute("src", absoluteURL);
    }
    }
    return urlList;
    }
    /**
    *方法說(shuō)明:相對(duì)路徑轉(zhuǎn)絕對(duì)路徑
    *輸入?yún)?shù):strWeb 網(wǎng)頁(yè)地址; innerURL 相對(duì)路徑鏈接
    *返回類型:絕對(duì)路徑鏈接
    */
    public static String makeAbsoluteURL(URL strWeb, String innerURL) {
    // TODO Auto-generated method stub
    //去除后綴
    int pos = innerURL.indexOf("?");
    if (pos != -1) {
    innerURL = innerURL.substring(0, pos);
    }
    if (innerURL != null
    && innerURL.toLowerCase().indexOf("http") == 0) {
    System.out.println(innerURL);
    return innerURL;
    }
    URL linkUri = null;
    try {
    linkUri = new URL(strWeb, innerURL);
    } catch (MalformedURLException e) {
    //TODO Auto-generated catch block
    e.printStackTrace();
    return null;
    }
    String absURL = linkUri.toString();
    absURL = JHtmlClear.replace(absURL, "../", "");
    absURL = JHtmlClear.replace(absURL, "./", "");
    System.out.println(absURL);
    return absURL;
    }
    /**
    *方法說(shuō)明:創(chuàng)建mht文件
    *輸入?yún)?shù):content 網(wǎng)頁(yè)文本內(nèi)容; urlScriptList 腳本鏈接集合; urlImageList 圖片鏈接集合
    *返回類型:
    */
    private void createMhtArchive(String content, ArrayList urlScriptList, ArrayList urlImageList) throws Exception {
    //Instantiate a Multipart object
    MimeMultipart mp = new MimeMultipart("related");
    Properties props = new Properties();
    props.put("mail.smtp.host", smtp);
    Session session = Session.getDefaultInstance(props, null);
    MimeMessage msg = new MimeMessage(session);
    // set mailer
    msg.setHeader("X-Mailer", "Code Manager .SWT");
    // set from
    if (from != null) {
    msg.setFrom(new InternetAddress(from));
    }
    // set subject
    if (subject != null) {
    msg.setSubject(subject);
    }
    // to
    if (to != null) {
    InternetAddress[] toAddresses = getInetAddresses(to);
    msg.setRecipients(Message.RecipientType.TO, toAddresses);
    }
    // cc
    if (cc != null) {
    InternetAddress[] ccAddresses = getInetAddresses(cc);
    msg.setRecipients(Message.RecipientType.CC, ccAddresses);
    }
    // bcc
    if (bcc != null) {
    InternetAddress[] bccAddresses = getInetAddresses(bcc);
    msg.setRecipients(Message.RecipientType.BCC, bccAddresses);
    }