-
JAVA Crawler Utility Class
2020-03-04 17:36:39 I put together a Java crawler utility class.
1. Maven dependencies
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.27</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
2. The utility class
import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.net.ssl.SSLContext;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HttpHtmlUnit {

    /** Request timeout in ms, default 20000. */
    private int timeout = 20000;

    /** How long to wait for async JS to finish, in ms, default 20000. */
    private int waitForBackgroundJavaScript = 20000;

    /** Cookie table. */
    private Map<String, String> cookieMap = new HashMap<>();

    /** Charset used to decode responses, default UTF-8. */
    private String charset = "UTF-8";

    private static HttpHtmlUnit httpUtils;

    private HttpHtmlUnit() {
    }

    /** Get the singleton instance. */
    public static HttpHtmlUnit getInstance() {
        if (httpUtils == null)
            httpUtils = new HttpHtmlUnit();
        return httpUtils;
    }

    /** Clear the cookie map. */
    public void invalidCookieMap() {
        cookieMap.clear();
    }

    public int getTimeout() {
        return timeout;
    }

    /** Set the request timeout. */
    public void setTimeout(int timeout) {
        this.timeout = timeout;
    }

    public String getCharset() {
        return charset;
    }

    /** Set the charset used to decode responses. */
    public void setCharset(String charset) {
        this.charset = charset;
    }

    public int getWaitForBackgroundJavaScript() {
        return waitForBackgroundJavaScript;
    }

    /** Set how long to wait for async JS when fetching the full HTML page. */
    public void setWaitForBackgroundJavaScript(int waitForBackgroundJavaScript) {
        this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
    }

    /** Parse a returned page into a jsoup Document. */
    public static Document parseHtmlToDoc(String html) throws Exception {
        return removeHtmlSpace(html);
    }

    private static Document removeHtmlSpace(String str) {
        Document doc = Jsoup.parse(str);
        String result = doc.html().replace(" ", "");
        return Jsoup.parse(result);
    }

    /** Execute a GET request and return the parsed Document. */
    public Document executeGetAsDocument(String url) throws Exception {
        return parseHtmlToDoc(executeGet(url));
    }

    /** Execute a GET request. */
    public String executeGet(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
        httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpClient httpClient = null;
        String str = "";
        try {
            httpClient = HttpClientBuilder.create().build();
            HttpClientContext context = HttpClientContext.create();
            CloseableHttpResponse response = httpClient.execute(httpGet, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            int state = response.getStatusLine().getStatusCode();
            if (state == 404) {
                str = ""; // note: a non-null entity below still overwrites this
            }
            try {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    str = EntityUtils.toString(entity, charset);
                }
            } finally {
                response.close();
            }
        } catch (IOException e) {
            throw e;
        } finally {
            try {
                if (httpClient != null)
                    httpClient.close();
            } catch (IOException e) {
                throw e;
            }
        }
        return str;
    }

    /** Execute a GET request over https and return the parsed Document. */
    public Document executeGetWithSSLAsDocument(String url) throws Exception {
        return parseHtmlToDoc(executeGetWithSSL(url));
    }

    /** Execute a GET request over https. */
    public String executeGetWithSSL(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
        httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpClient httpClient = null;
        String str = "";
        try {
            httpClient = createSSLInsecureClient();
            HttpClientContext context = HttpClientContext.create();
            CloseableHttpResponse response = httpClient.execute(httpGet, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            int state = response.getStatusLine().getStatusCode();
            if (state == 404) {
                str = "";
            }
            try {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    str = EntityUtils.toString(entity, charset);
                }
            } finally {
                response.close();
            }
        } catch (IOException e) {
            throw e;
        } catch (GeneralSecurityException ex) {
            throw ex;
        } finally {
            try {
                if (httpClient != null)
                    httpClient.close();
            } catch (IOException e) {
                throw e;
            }
        }
        return str;
    }

    /** Execute a POST request and return the parsed Document. */
    public Document executePostAsDocument(String url, Map<String, String> params) throws Exception {
        return parseHtmlToDoc(executePost(url, params));
    }

    /** Execute a POST request. */
    public String executePost(String url, Map<String, String> params) throws Exception {
        String reStr = "";
        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
        List<NameValuePair> paramsRe = new ArrayList<>();
        for (Map.Entry<String, String> entry : params.entrySet()) {
            paramsRe.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
        }
        CloseableHttpClient httpclient = HttpClientBuilder.create().build();
        CloseableHttpResponse response;
        try {
            httpPost.setEntity(new UrlEncodedFormEntity(paramsRe));
            HttpClientContext context = HttpClientContext.create();
            response = httpclient.execute(httpPost, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            HttpEntity entity = response.getEntity();
            reStr = EntityUtils.toString(entity, charset);
        } catch (IOException e) {
            throw e;
        } finally {
            httpPost.releaseConnection();
        }
        return reStr;
    }

    /** Execute a POST request over https and return the parsed Document. */
    public Document executePostWithSSLAsDocument(String url, Map<String, String> params) throws Exception {
        return parseHtmlToDoc(executePostWithSSL(url, params));
    }

    /** Execute a POST request over https. */
    public String executePostWithSSL(String url, Map<String, String> params) throws Exception {
        String re = "";
        HttpPost post = new HttpPost(url);
        List<NameValuePair> paramsRe = new ArrayList<>();
        for (Map.Entry<String, String> entry : params.entrySet()) {
            paramsRe.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
        }
        post.setHeader("Cookie", convertCookieMapToString(cookieMap));
        post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpResponse response;
        try {
            CloseableHttpClient httpClientRe = createSSLInsecureClient();
            HttpClientContext contextRe = HttpClientContext.create();
            post.setEntity(new UrlEncodedFormEntity(paramsRe));
            response = httpClientRe.execute(post, contextRe);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                re = EntityUtils.toString(entity, charset);
            }
            getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
        } catch (Exception e) {
            throw e;
        }
        return re;
    }

    /**
     * Send a POST request with a JSON body.
     * @param url      address
     * @param jsonBody json body
     */
    public String executePostWithJson(String url, String jsonBody) throws Exception {
        String reStr = "";
        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
        CloseableHttpClient httpclient = HttpClientBuilder.create().build();
        CloseableHttpResponse response;
        try {
            httpPost.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
            HttpClientContext context = HttpClientContext.create();
            response = httpclient.execute(httpPost, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            HttpEntity entity = response.getEntity();
            reStr = EntityUtils.toString(entity, charset);
        } catch (IOException e) {
            throw e;
        } finally {
            httpPost.releaseConnection();
        }
        return reStr;
    }

    /**
     * Send a POST request with a JSON body over SSL.
     * @param url      address
     * @param jsonBody json body
     */
    public String executePostWithJsonAndSSL(String url, String jsonBody) throws Exception {
        String re = "";
        HttpPost post = new HttpPost(url);
        post.setHeader("Cookie", convertCookieMapToString(cookieMap));
        post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpResponse response;
        try {
            CloseableHttpClient httpClientRe = createSSLInsecureClient();
            HttpClientContext contextRe = HttpClientContext.create();
            post.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
            response = httpClientRe.execute(post, contextRe);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                re = EntityUtils.toString(entity, charset);
            }
            getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
        } catch (Exception e) {
            throw e;
        }
        return re;
    }

    /**
     * Fetch the full page source, waiting for async JS to finish.
     * @param url page URL
     */
    public String getHtmlPageResponse(String url) throws Exception {
        String result = "";
        final WebClient webClient = new WebClient(BrowserVersion.CHROME);
        webClient.getOptions().setThrowExceptionOnScriptError(false); // do not throw when JS fails
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); // do not throw on non-200 status
        webClient.getOptions().setActiveXNative(true);
        webClient.getOptions().setCssEnabled(true); // enable CSS
        webClient.getOptions().setJavaScriptEnabled(true); // important: enable JS
        webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // important: support AJAX
        webClient.getOptions().setTimeout(timeout); // the "browser's" request timeout
        webClient.setJavaScriptTimeout(timeout); // JS execution timeout
        HtmlPage page;
        try {
            page = webClient.getPage(url);
        } catch (Exception e) {
            webClient.close();
            throw e;
        }
        webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript); // blocks the current thread
        result = page.asXml();
        webClient.close();
        return result;
    }

    /**
     * Fetch the page as a Document, waiting for async JS to finish.
     * @param url page URL
     */
    public Document getHtmlPageResponseAsDocument(String url) throws Exception {
        return parseHtmlToDoc(getHtmlPageResponse(url));
    }

    private void getCookiesFromCookieStore(CookieStore cookieStore, Map<String, String> cookieMap) {
        List<Cookie> cookies = cookieStore.getCookies();
        for (Cookie cookie : cookies) {
            cookieMap.put(cookie.getName(), cookie.getValue());
        }
    }

    private String convertCookieMapToString(Map<String, String> map) {
        String cookie = "";
        for (Map.Entry<String, String> entry : map.entrySet()) {
            cookie += (entry.getKey() + "=" + entry.getValue() + "; ");
        }
        if (map.size() > 0) {
            cookie = cookie.substring(0, cookie.length() - 2);
        }
        return cookie;
    }

    /** Create an SSL client that trusts all certificates. */
    private static CloseableHttpClient createSSLInsecureClient() throws GeneralSecurityException {
        try {
            SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, (chain, authType) -> true).build();
            SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(sslContext,
                    (s, sslContextL) -> true);
            return HttpClients.custom().setSSLSocketFactory(sslConnectionSocketFactory).build();
        } catch (GeneralSecurityException e) {
            throw e;
        }
    }
}
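A minimal usage sketch (not from the original post; the demo class name and URLs are placeholders):

import org.jsoup.nodes.Document;

public class HttpHtmlUnitDemo {
    public static void main(String[] args) throws Exception {
        HttpHtmlUnit http = HttpHtmlUnit.getInstance();
        http.setTimeout(10000);
        // plain GET, parsed into a jsoup Document (placeholder URL)
        Document doc = http.executeGetAsDocument("http://example.com");
        System.out.println(doc.title());
        // full page source after HtmlUnit has run the page's async JS (placeholder URL)
        String rendered = http.getHtmlPageResponse("http://example.com");
        System.out.println(rendered.length());
    }
}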
3. Issues encountered:
htmlunit pulls in a fairly old version of commons-io. If another part of the project uses a newer commons-io, the version conflict can cause problems. To resolve it, follow Maven's dependency-mediation rules: the version declared earlier in the pom is the one that gets picked up.
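One common fix, sketched here as an assumption rather than taken from the original post (the commons-io version below is illustrative), is to exclude the transitive copy and declare the version you want at the top level:

<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.27</version>
    <exclusions>
        <exclusion>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<!-- an explicit top-level declaration wins over transitive versions -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.6</version>
</dependency>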
-
MinerUtil.java Crawler Utility Class
2019-09-20 15:59:27 MinerUtil.java crawler utility class. package com.iteye.injavawetrust.miner; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java....
-
Java crawler utility classes _ MinerUtil.java Crawler Utility Class
2021-03-12 10:44:16

package com.iteye.injavawetrust.miner;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler utility class.
 * @author InJavaWeTrust
 */
public class MinerUtil {

    private static final Log LOG = LogFactory.getLog(MinerUtil.class);

    public static long starTime = 0;

    /**
     * Check whether a string is blank.
     * @param param
     * @return true if blank; false otherwise
     */
    public static boolean isBlank(String param) {
        return null == param || "".equals(param.trim());
    }

    /**
     * Check whether a URL ends with "html".
     * @param url
     * @return true if it does; false otherwise
     */
    public static boolean checkURL(String url) {
        String html = url.substring(url.lastIndexOf(".") + 1);
        return "html".equals(html);
    }

    /**
     * Check whether a URL contains any of the given keywords.
     * @param key  the URL to test
     * @param keys the keyword list
     * @return true if it does; false otherwise
     */
    public static boolean checkKeys(String key, List<String> keys) {
        boolean flag = false;
        for (String k : keys) {
            if (key.contains(k)) {
                flag = true;
                break;
            }
        }
        return flag;
    }

    public static boolean isValidFileName(String fileName) {
        if (fileName == null || fileName.length() > 255) {
            return false;
        } else {
            return fileName
                    .matches("[^\\s\\\\/:\\*\\?\\\"<>\\|](\\x20|[^\\s\\\\/:\\*\\?\\\"<>\\|])*[^\\s\\\\/:\\*\\?\\\"<>\\|\\.]$");
        }
    }

    /**
     * Collect all link URLs on a page.
     * @param url page URL
     * @return set of href values
     */
    public static Set<String> getAllUrl(String url) {
        Set<String> urls = new HashSet<String>();
        try {
            Connection conn = Jsoup.connect(url);
            conn.header("User-Agent",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13"); // pretend to be a browser
            Document document = conn.timeout(5000).get();
            Elements hrefs = document.select("a[href]");
            Iterator<Element> hrefIter = hrefs.iterator();
            while (hrefIter.hasNext()) {
                Element href = hrefIter.next();
                urls.add(href.attr("href"));
            }
        } catch (Exception e) {
            LOG.info("Exception while collecting URLs, offending URL [" + url + "]");
            LOG.info("Exception message [" + e.getMessage() + "]");
        }
        return urls;
    }

    /**
     * Convert milliseconds to HH:mm:ss.
     * @param ms milliseconds
     * @return HH:mm:ss
     */
    public static String msToss(long ms) {
        SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");
        formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));
        String ss = formatter.format(ms);
        return ss;
    }

    /**
     * Write an html page to a local file.
     * @param map a map holding "title" (the file name) and "html" (the page content)
     */
    public static void getHtmlToLocal(Map<String, String> map) {
        Writer writer = null;
        try {
            String path = MinerConstanits.HTMLPATH + getToday();
            makeDir(path);
            writer = new OutputStreamWriter(new FileOutputStream(new File(path
                    + File.separator + map.get("title"))), "UTF-8");
            writer.write(map.get("html"));
            writer.flush();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * A file name must not contain any of the characters:
     * \/:*?"<>|
     * @param title the title
     * @return the title with all illegal file-name characters removed
     */
    public static String fileName(String title) {
        return title
                .replaceAll("\\\\", "")
                .replaceAll("/", "")
                .replaceAll(":", "")
                .replaceAll("\\*", "")
                .replaceAll("\\?", "")
                .replaceAll("\"", "")
                .replaceAll("<", "")
                .replaceAll(">", "")
                .replaceAll("\\|", "");
    }

    /**
     * Get today's date.
     * @return today's date
     */
    public static String getToday() {
        String result = "";
        Date date = new Date();
        result = format(date);
        return result;
    }

    /**
     * Format a date.
     * @param date the date
     * @return the date as yyyyMMdd
     */
    public static String format(Date date) {
        String format = "yyyyMMdd";
        SimpleDateFormat fmt = new SimpleDateFormat(format);
        return fmt.format(date);
    }

    /**
     * Create the storage directory.
     * @param path storage directory
     */
    public static void makeDir(String path) {
        File file = new File(path);
        if (!file.exists()) {
            file.mkdirs();
            LOG.info("Created storage directory [" + path + "]");
        }
    }

    public static boolean checkBeforeStart(MinerConfig config) {
        if (null == config) {
            LOG.info("config is not set!!!");
            return false;
        }
        if (null == config.getKeys() || 0 == config.getKeys().size()) {
            LOG.info("Keyword list is not configured!!!");
            return false;
        }
        if (null == config.getStoreType()) {
            LOG.info("Storage type is not configured!!!");
            return false;
        }
        if (config.getMaxDepth() < 1) {
            LOG.info("Invalid maximum crawl depth!!!");
            return false;
        }
        if (config.getMinerHtmlThreadNum() < 1) {
            LOG.info("Invalid page-download thread count!!!");
            return false;
        }
        if (config.getMiseringThreadNum() < 1) {
            LOG.info("Invalid page-analysis thread count!!!");
            return false;
        }
        if (config.getMinserStoreThreadNum() < 1) {
            LOG.info("Invalid storage thread count!!!");
            return false;
        }
        return true;
    }

    public static void main(String[] args) {
        String path = MinerConstanits.HTMLPATH + File.separator + getToday();
        makeDir(path);
        //System.out.println(getToday());
        //String test = "http://my.163.com/2015/11/27/17763_578935.html";
        //System.out.println(fileName(test));
        //System.out.println(MinerUtil.isBlank(null));
        //System.out.println(MinerUtil.isBlank(""));
        //System.out.println(MinerUtil.isBlank(" "));
        //System.out.println(MinerUtil.isBlank("bbb"));
        //System.out.println(MinerUtil.isBlank(" bbb "));
        //String key = "http://www.jqu.net.cn";
        //List<String> keys = new ArrayList<String>();
        //keys.add("http://www.jqu.net.cn");
        //System.out.println(MinerUtil.checkKeys(key, keys));
    }
}
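A quick usage sketch (not part of the original class; the demo class name and URL are placeholders; the msToss result follows from 3723000 ms = 1 h 2 min 3 s):

import java.util.Set;

public class MinerUtilDemo {
    public static void main(String[] args) {
        // collect every href found on the page (placeholder URL)
        Set<String> links = MinerUtil.getAllUrl("http://example.com");
        for (String link : links) {
            if (MinerUtil.checkURL(link)) { // keep only links ending in .html
                System.out.println(MinerUtil.fileName(link)); // strip illegal file-name chars
            }
        }
        System.out.println(MinerUtil.msToss(3723000L)); // prints 01:02:03
    }
}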
-
Java Web Crawler -- A URLConnection-Based Crawler Utility Class
2018-12-26 09:56:31 In this era where data is king, crawlers are used more and more widely. For a newbie programmer who wants to write a crawler, Python is the obvious choice. But for those veteran Java programmers (or programmer-esses) who want to crawl in Java, it is perfectly doable, just not as convenient as in Python. As one of those Java veterans myself, I am recording here the utility class I use when writing web crawlers in Java.
Add the commons-lang3 dependency to pom.xml:
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.6</version>
</dependency>
The complete SpiderHttpUtils utility class:
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;

import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.apache.commons.lang3.StringUtils;

public class SpiderHttpUtils {

    public static String sendGet(boolean isHttps, String requestUrl, Map<String, String> params,
            Map<String, String> headers, String charSet) {
        if (StringUtils.isBlank(requestUrl)) {
            return "";
        }
        if (StringUtils.isBlank(charSet)) {
            charSet = "UTF-8";
        }
        URL url = null;
        URLConnection conn = null;
        BufferedReader br = null;
        try {
            // open the connection
            url = new URL(requestUrl + "?" + requestParamsBuild(params));
            if (isHttps) {
                conn = getHttpsUrlConnection(url);
            } else {
                conn = (HttpURLConnection) url.openConnection();
            }
            // common request headers
            // content types the client accepts
            conn.setRequestProperty("Accept", "*/*");
            // keep the connection alive
            conn.setRequestProperty("Connection", "keep-alive");
            // client system information
            conn.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
            // custom request headers
            if (null != headers && headers.size() > 0) {
                for (Map.Entry<String, String> entry : headers.entrySet()) {
                    conn.setRequestProperty(entry.getKey(), entry.getValue());
                }
            }
            // other options
            // conn.setUseCaches(false);      // disable caching
            // conn.setReadTimeout(10000);    // read timeout
            // conn.setConnectTimeout(10000); // connect timeout

            // establish the actual connection
            conn.connect();
            // read the response
            br = new BufferedReader(new InputStreamReader(conn.getInputStream(), charSet));
            String line = null;
            StringBuilder sb = new StringBuilder();
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString();
        } catch (Exception exception) {
            return "";
        } finally {
            try {
                if (br != null) {
                    br.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static String requestParamsBuild(Map<String, String> map) {
        String result = "";
        if (null != map && map.size() > 0) {
            StringBuffer sb = new StringBuffer();
            for (Map.Entry<String, String> entry : map.entrySet()) {
                try {
                    String value = URLEncoder.encode(entry.getValue(), "UTF-8");
                    sb.append(entry.getKey() + "=" + value + "&");
                } catch (UnsupportedEncodingException e) {
                    e.printStackTrace();
                }
            }
            result = sb.substring(0, sb.length() - 1);
        }
        return result;
    }

    private static HttpsURLConnection getHttpsUrlConnection(URL url) throws Exception {
        HttpsURLConnection httpsConn = (HttpsURLConnection) url.openConnection();
        // create an SSLContext initialized with our own (trust-everything) trust manager
        TrustManager[] tm = { new X509TrustManager() {
            public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                // check client certificates
            }

            public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                // check server certificates
            }

            public X509Certificate[] getAcceptedIssuers() {
                // return the accepted X509 certificate issuers
                return null;
            }
        } };
        SSLContext sslContext = SSLContext.getInstance("SSL", "SunJSSE");
        sslContext.init(null, tm, new java.security.SecureRandom());
        // obtain the SSLSocketFactory from the SSLContext above
        SSLSocketFactory ssf = sslContext.getSocketFactory();
        httpsConn.setSSLSocketFactory(ssf);
        return httpsConn;
    }

    public static byte[] getFileAsByte(boolean isHttps, String requestUrl) {
        if (StringUtils.isBlank(requestUrl)) {
            return new byte[0];
        }
        URL url = null;
        URLConnection conn = null;
        BufferedInputStream bi = null;
        try {
            // open the connection
            url = new URL(requestUrl);
            if (isHttps) {
                conn = getHttpsUrlConnection(url);
            } else {
                conn = (HttpURLConnection) url.openConnection();
            }
            // common request headers
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("Connection", "keep-alive");
            conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // other options
            conn.setConnectTimeout(3000); // connect timeout
            conn.setDoOutput(true);
            conn.setDoInput(true);
            // establish the actual connection
            conn.connect();
            // read the response
            bi = new BufferedInputStream(conn.getInputStream());
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[2048];
            int len = 0;
            while ((len = bi.read(buffer)) != -1) {
                outStream.write(buffer, 0, len);
            }
            bi.close();
            byte[] data = outStream.toByteArray();
            return data;
        } catch (Exception exception) {
            return new byte[0];
        } finally {
            try {
                if (bi != null) {
                    bi.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
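A hedged usage sketch (the demo class name and URLs are placeholders):

import java.util.HashMap;
import java.util.Map;

public class SpiderDemo {
    public static void main(String[] args) {
        // GET an HTTPS page; headers may be null, params may be null or empty
        Map<String, String> params = new HashMap<String, String>();
        params.put("q", "java crawler");
        String html = SpiderHttpUtils.sendGet(true, "https://example.com/search", params, null, "UTF-8");
        System.out.println(html.length());
        // download a binary resource (e.g. an image) as bytes
        byte[] data = SpiderHttpUtils.getFileAsByte(false, "http://example.com/logo.png");
        System.out.println(data.length + " bytes");
    }
}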
-
curl command in Java _ My Java crawler tool JAVA-CURL is now open source
2020-12-20 14:04:06 Tools and materials. Overview: the CUrl class is an Http utility class implemented with standard Java's HttpURLConnection, modeled on the command-line tool cURL. Features: built on the standard Java runtime's Http classes, source compatibility level 1.6, widely applicable; usable in server-side, Android, and other Java environments. Code... -
My Java crawler tool JAVA-CURL is now open source
2018-11-20 17:21:06 Project address: https://github.com/rockswang/... Central repository: ... Overview: the CUrl class is an Http utility class implemented with standard Java's HttpURLConnection, modeled on the command-line tool cURL. Features: built on the standard Java runtime's Http classes, source compatibility level 1... -
Java Crawler Tools / Open-Source API Comparison
2015-06-09 15:03:36 Note: based on information collected online, not verified by running the code. Name | Category | Pros | Cons | Git | Verdict: Apache Nutch | search engine | distributed (depends on Hadoop), ... | crawler | microkernel + plugin architecture, configuration-driven (no code required), multithreaded | too few users | star 525 | for -
Java crawler example tutorial _ An Example Analysis of a Java Crawler
2021-03-12 23:21:30 First of all, reading this article alone cannot guarantee that you... Code first, then a step-by-step walkthrough: this is a utility class, no need to study it closely; http-request utility classes like it can be found all over the web, just import any missing packages yourself. package com.df.util; import java.io.BufferedReader; import java.io... -
Java crawler JS _ When a Java Crawler Meets Asynchronously Loaded Data
2021-02-28 16:12:50 Original title: When a Java Crawler Meets Asynchronously Loaded Data. Many projects today split front end and back end, which makes asynchronously loaded data an even more prominent problem, so don't be surprised or alarmed when you run into it while crawling. Broadly speaking there are two solutions to this class of problem: 1. embed a browser...
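The excerpt cuts off before the second option; as a hedged illustration (my own sketch, not the article's code; the endpoint URL is hypothetical), a common alternative to rendering JS is to request the JSON endpoint the page calls via XHR directly:

import org.jsoup.Jsoup;

public class AsyncDataDemo {
    public static void main(String[] args) throws Exception {
        // find the real endpoint in the browser dev tools (Network tab); this one is hypothetical
        String json = Jsoup.connect("https://example.com/api/list?page=1")
                .ignoreContentType(true) // accept a non-HTML (JSON) response
                .userAgent("Mozilla/5.0")
                .execute()
                .body();
        System.out.println(json);
    }
}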
-
Java crawler demo
2019-04-02 15:04:06 Java crawler demo: basic concepts of web crawlers; classification of web crawlers; jsoup as a page-fetching tool; parsing a URL-loaded Document with jsoup; traversal in jsoup; using jsoup selectors; HttpClient as a page-fetching tool; downloading the HttpClient jars; HttpClient's... -
[Java] Crawling Data: Utility-Class Encapsulation, Full Code Shared
2021-01-23 16:46:06 Utility-class encapsulation: import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.util.Iterator; public class BaseCrawl { public static String ... -
Java crawler anti-blocking _ Java Crawler Techniques: Bypassing Baidu Cloud Protection to Scrape Site Content
2021-03-11 13:32:21 As shown in the figure: first you need an Http utility class, HttpHandle. package org.coody.robot.util; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection;... -
Java Crawler in Practice
2020-03-30 16:24:15 1. Project requirements: collect data from a certain class of websites ... Crawler tool: webmagic; database: MySQL; scheduler: @Scheduled with cron (online cron expression parser: http://cron.qqe2.com/); page analysis: Google Chrome; expression parsing: XP... -
What is the outlook for Java crawlers _ I've been working on Java crawlers lately; sharing some insights and lessons
2021-03-01 08:23:51 First of all, reading this article alone cannot guarantee that you... Code first, then a step-by-step walkthrough: this is a utility class, no need to study it closely; http-request utility classes like it can be found all over the web, just import any missing packages yourself. package com.df.util; import java.io.BufferedReader; import java.io... -
Java Basics Tutorial: The Java Arrays Utility Class
2021-03-17 17:45:08 Today we walk through the Java Arrays utility class. Java's Arrays class is a utility class that contains many methods for operating on arrays. The methods in Arrays are all static (static methods can be called directly via the class name), so they can be invoked directly as Arrays.xxx(xxx)...
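A small illustration of that calling style (standard library only; the demo class name is a placeholder):

import java.util.Arrays;

public class ArraysDemo {
    public static void main(String[] args) {
        int[] nums = { 5, 2, 8, 1 };
        Arrays.sort(nums);                                 // in-place sort
        System.out.println(Arrays.toString(nums));         // [1, 2, 5, 8]
        System.out.println(Arrays.binarySearch(nums, 8));  // 3 (array must be sorted first)
    }
}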
-
Utility classes a humble programmer has to bookmark, part 1: a website crawler utility class
2014-10-30 10:02:11 import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.MalformedURLException; impor -
Java/Android multi-site novel crawler, packaged as a utility class, with concurrent search/download and update tracking
2019-04-16 16:41:50 Overview: a Java/Android multi-site novel crawler, packaged as a utility class, with concurrent search and download, and support for following updates. More: report bugs to the author. Tags: Java/Android novel crawler tool. Use a few simple lines of code to build your own open-source novel app, with multi-site parsing and concurrent search and download... -
A utility class for CAPTCHA recognition in Java crawlers: recognizing digit-and-letter verification images
2020-11-23 12:04:34 As we all know, a CAPTCHA's job is to verify that you are not a robot; it is basically a verification feature used against crawlers and data-brushing. For this situation, to quote an elder's famous words, "you must defeat magic with magic": if we want to crawl other people's data, we can only do it with more advanced techniques. What this article introduces is also not... -
JAVA Crawlers Explained in Detail
2018-10-17 21:33:36 How a crawler works: when we visit a web page, the html source is downloaded locally, so we can imitate the way the page is requested and ... Utility classes used: URL: opens the page link, i.e. performs the access to the url; URLConnection: obtains the downloaded html source code ... -
A Small Java Crawler Exercise
2020-02-23 01:34:58 I recently read up a bit on Java crawlers, mainly the simple use of HttpClient and Jsoup; I barely touched crawler frameworks, since the plan is to get familiar with these two tools first, which should make learning a crawler framework much easier later. If only... -
Java Crawler in Practice, Part 2
2020-04-01 12:47:44 1. Project requirements: collect data from a certain class of websites; start a scheduled task daily for the main collection points ... Crawler tool: webmagic; database: MySQL; scheduler: @Scheduled with cron (online cron expression parser: http://cron.qqe2.com/); page analysis:... -
Java crawler -- jsoup
2019-09-10 20:46:24 Java crawler 02 ----- Jsoup. 1. Environment: IDE: IDEA; Maven: Maven; Jsoup 1.10.2: jsoup; junit: unit testing; commons-io: file IO operations; commons-lang3: for the StringUtils utility class. pom.xml: <?xml version="1.0" encoding=...