加入 jsoup 和 htmlunit 的依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.25</version>
</dependency>
代码:
package com.jm.bigdata.util;import java.io.IOException;
import java.util.logging.Level;import org.apache.htrace.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;public class ReptileTools { /**
* 使用htmlunit模拟Chrome并获取全部网页信息
* @param phoneNumber
* @return
*/
public static String searchMobile2(String cookie,String DownloadUrl) { String title="";
Document doc = null;
try {
//构造一个webClient 模拟Chrome 浏览器
WebClient webClient = new WebClient(BrowserVersion.CHROME);
//屏蔽日志信息
LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
//支持JavaScript
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setActiveXNative(false);
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.getOptions().setTimeout(5000);
HtmlPage rootPage = webClient.getPage(DownloadUrl);
//设置一个运行JavaScript的时间
webClient.waitForBackgroundJavaScript(5000);
String html = rootPage.asXml();
doc = Jsoup.parse(html);
System.out.println(doc);
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
return null;
}
return title;
}
}
这样我们就可以得到一个包含运行 JavaScript 之后的完整源网页了