  • Java crawler data

    2020-06-05 17:20:33
    package net.aykj.util;
    
    import java.io.File;
    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import javax.servlet.ServletContext;
    
    import org.htmlparser.Parser;
    import org.htmlparser.Tag;
    import org.htmlparser.filters.TagNameFilter;
    import org.htmlparser.util.ParserException;
    import org.htmlparser.util.SimpleNodeIterator;
    
    import net.aykj.pojo.Annex;
    import net.aykj.pojo.Article;
    import net.aykj.pojo.Rule;
    import net.aykj.service.AnnexService;
    import net.aykj.service.ArticleService;
    import net.aykj.service.RuleService;
    
    /**
     * Spider thread that collects (crawls) articles according to the configured rules.
     * Update by Bingyong.Wang: when the detail page does not show a publish time, take the time from the list page instead.
     */
    @SuppressWarnings({"unchecked", "unused"})
    public class SpiderThreadUtil extends Thread {
    	
    	private boolean stop = false;
    	private RuleService ruleService = null;
    	private ArticleService articleService = null;
    	private AnnexService annexService =null;
    	private ServletContext servletContext = null;
    	private Integer rulesId = null;
    	private String rulesName = null;
    	
    	public SpiderThreadUtil(RuleService ruleService, ArticleService articleService,AnnexService annexService, ServletContext servletContext, String rulesName, Integer rulesId) {
    		this.ruleService = ruleService;
    		this.articleService = articleService;
    		this.annexService=annexService;
    		this.servletContext = servletContext;
    		this.rulesId = rulesId;
    		this.rulesName = rulesName;
    	}
    	
    	@Override
    	public void run() {
    		try {
    			String totalCount = rulesName + "_totalCount";
    			// 采集的数量
    			String getCount = rulesName + "_getCount"; 
    			// 已经存在的数量
    			String gotCount = rulesName + "_gotCount"; 
    			String errorCount = rulesName + "_errorCount";
    			
    			List<Rule> ruleList =  ruleService.queryRuleListByRulesId(rulesId, true);
    			List<Rule> newRuleList = new ArrayList<Rule>();
    			if (ruleList != null && ruleList.size() > 0) {
    				int total = 0;
    				for (Rule rule : ruleList) {
    					// Query the detail-page links. If the detail page has no time, take it from the list page: the detail link and the list time are combined into one string, joined by "VT".
    					List<String> viewUrlList = queryViewUrlList(rule, errorCount); 
    					
    					System.out.println("翻转的详情连接======" + viewUrlList);
    					
    					total = total + viewUrlList.size();					
    					System.out.println("-----------------------"+rule.getNewsClassId()+"栏目,查到" + total + "篇---------------------");
    					
    					// 需要采集的详细页的连接
    					rule.setViewUrlList(viewUrlList); 
    					newRuleList.add(rule);
    					
    					// 判断是否结束线程
    					if (isStop(totalCount, getCount, gotCount, errorCount)) break; 
    				}
    				// 设置总共需要采集的文章数量
    				servletContext.setAttribute(totalCount, total); 
    				
    				// 采集文章
    				catchArticles(newRuleList, totalCount, getCount, gotCount, errorCount); 
    				
    				System.out.println("----------------------- 数据采集完成,共采集到" + total + "篇 ----------------------");
    			}
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    	}
    	
    	private void catchArticles(List<Rule> newRuleList, String totalCount, String getCount, String gotCount, String errorCount) throws Exception {
    		// 详情链接
    		String viewUrlTemp = null;
    		// 创建时间
    		String createtime = null;
    		for (Rule rule : newRuleList) {
    			List<String> viewUrlList = (List<String>) rule.getViewUrlList();
    			if (viewUrlList != null) {
    				for (String viewUrl : viewUrlList) {
    					if (viewUrl.indexOf("VT") > 0) {
    						// Link and list-page time were joined with "VT": split them apart
    						viewUrlTemp = viewUrl.split("VT")[0].replaceAll("href=\"|\"", "");
    						createtime = viewUrl.split("VT")[1];
    					} else {
    						// No "VT" marker: the detail page carries its own time, so only clean the current link
    						viewUrlTemp = viewUrl.replaceAll("href=\"|\"", "");
    						createtime = null;
    					}
    					if (isStop(totalCount, getCount, gotCount, errorCount)) break; //判断是否结束线程
    					System.out.println(viewUrlTemp);
    					
    					// 详情页链接及文章时间
    					if (GeneralUtil.isNotNull(viewUrlTemp) && GeneralUtil.isNotNull(createtime)) {
    						catchArticleByViewUrlAndCreatetime(rule, viewUrlTemp, createtime, totalCount, getCount, gotCount, errorCount);
    					} else {
    						catchArticle(rule, viewUrlTemp, totalCount, getCount, gotCount, errorCount);
    					}
    				}
    			}
    		}
    	}
    	
    
    	private void catchArticle(Rule rule, String viewUrl, String totalCount, String getCount, String gotCount, String errorCount) throws Exception {
    		String encode = rule.getEncode();
    		encode = encode == null ? "UTF-8" : encode;
    		String content = HttpUtil.get(viewUrl, encode);
    		if("HTTP/1.1 404 Not Found".equals(content)){
    			System.out.println("详细页链接:"+viewUrl+",访问404,跳过");
    			this.addErrorCount(errorCount);
    		}else{
    			String host = rule.getHost();
    			
    			String titleRegex = rule.getTitleRegex();
    			String authorRegex = rule.getAuthorRegex();
    			String createtimeRegex = rule.getCreatetimeRegex();
    			String sourceRegex = rule.getSourceRegex();
    			String hitsRegex = rule.getHitsRegex();
    			String contentRegex = rule.getContentRegex();
    			
    			String titleFilterRegex = rule.getTitleFilterRegex();
    			String authorFilterRegex = rule.getAuthorFilterRegex();
    			String createtimeFilterRegex = rule.getCreatetimeFilterRegex();
    			String sourceFilterRegex = rule.getSourceFilterRegex();
    			String hitsFilterRegex = rule.getHitsFilterRegex();
    			String contentFilterRegex = rule.getContentFilterRegex();
    			
    			Integer subsiteId = rule.getSubsiteId();
    			Integer objId = rule.getId();
    			Integer newsClassId = rule.getNewsClassId();
    			
    			List<String> titleList = this.extractStrByPattern(content, titleRegex, errorCount, true, titleFilterRegex);
    			List<String> authorList = this.extractStrByPattern(content, authorRegex, errorCount, true, authorFilterRegex);
    			List<String> createtimeList = this.extractStrByPattern(content, createtimeRegex, errorCount, true, createtimeFilterRegex);
    			List<String> sourceList = this.extractStrByPattern(content, sourceRegex, errorCount, true, sourceFilterRegex);
    			List<String> hitsList = this.extractStrByPattern(content, hitsRegex, errorCount, true, hitsFilterRegex);
    			List<String> contentList = this.extractStrByPattern(content, contentRegex, errorCount, false, contentFilterRegex);
    			
    			String title = titleList!=null && titleList.size()>0 ? titleList.get(0).trim(): null;
    			// 没有名称的说明没有采集到  跳过
    			if(GeneralUtil.isNotNull(title)){
    				String createtimeStr = createtimeList!=null && createtimeList.size()>0 ? createtimeList.get(0): null;
    				
    				// Extract the publish time
    				if (createtimeStr != null && createtimeStr.contains("发布时间")) {
    					createtimeStr = createtimeStr.substring(createtimeStr.indexOf("发布时间"), createtimeStr.indexOf("作者")).replace("发布时间:", "").trim();
    				}
    				// 获取文章来源
    				String source = sourceList!=null && sourceList.size()>0 ? sourceList.get(0): "原创";
    				if (source.contains("新闻来源")) {
    					source = source.substring(source.indexOf("新闻来源:"), source.indexOf("新闻来源")).replace("新闻来源:", "").trim();
    					source = null != source && source != " " && source.length() == 2 ? source : "原创";
    				}
    				
    				// 作者
    				String author = authorList!=null && authorList.size()>0 ? authorList.get(0).trim(): "管理员";
    				if (author.contains("作者")) {
    					author = author.substring(author.indexOf("作者"), author.indexOf("作者")).replace("作者:", "").trim();
    					System.out.println(source.length());
    					author = null != author && author != " " && author.length() == 2 ? author : "管理员";
    				}
    				
    				String hits = hitsList!=null && hitsList.size()>0 ? hitsList.get(0).trim(): "0";
    				String articleContent = contentList!=null && contentList.size()>0 ? contentList.get(0): null;
    				
    				//处理采集到的时间
    				Date createtime = null;
    				if (createtimeStr == null) {
    					createtime = new Date();
    				} else {
    //					System.out.println(createtimeStr);
    					createtimeStr=createtimeStr.replaceAll("\\r.*\\n", "").trim();
    //					createtimeStr=createtimeStr.replaceAll("\u4E00-\u9FFF", "");
    					//System.out.println(createtimeStr);
    					String timeFormat = rule.getTimeFormat();
    					if (timeFormat != null && !"".equals(timeFormat)) {
    						SimpleDateFormat sdf = new SimpleDateFormat(timeFormat);
    						createtime = sdf.parse(createtimeStr);
    					}
    				}
    				
    				if (articleExist(title, subsiteId, newsClassId)) {
    					this.addGotCount(gotCount);
    					return;
    				}
    				
    				//采集图片
    				// articleContent = catchImage(articleContent, host);
    				articleContent = catchImageByViewUrl(articleContent, viewUrl);
    				Integer aid=addArticle(title, author, createtime, source, articleContent, subsiteId, newsClassId, hits);
    				//创建一个缩略图
    				String imgRegex="/static/upload.*?pdf";//获取一个缩略图的正则
    				Pattern pattern = Pattern.compile(imgRegex, Pattern.DOTALL);
    				Matcher matcher = pattern.matcher(articleContent);
    				String imgSrc="";
    				while(matcher.find()){
    					imgSrc=matcher.group();
    					break;
    				}
    				if(GeneralUtil.isNotNull(imgSrc)){
    					String annexPath=downloadFile("http://www.ynsap.org.cn/"+imgSrc);
    					Annex annex = new Annex();
    					annex.setPath(annexPath);
    					annex.setName(imgSrc.substring(imgSrc.lastIndexOf("/")+1));
    					annex.setExt("pdf");
    					annex.setType("annex");
    					annex.setObj("article");
    					annex.setCreatetime(createtime);
    					annex.setObjId(aid);
    					annexService.save(annex);
    				}
    				
    				this.addGetCount(getCount);
    			}else{
    				System.out.println("详细页链接:"+viewUrl+",采集到的标题为空");
    				this.addErrorCount(errorCount);
    			}
    		}
    	}
    	
    	/**
    	 * 根据文章详情页链接爬取文章,文章时间已获取有
    	 * @param rule
    	 * @param viewUrl
    	 * @param createtimeTemp
    	 * @param totalCount
    	 * @param getCount
    	 * @param gotCount
    	 * @param errorCount
    	 * @throws Exception
    	 * void
    	 * Bingyong.Wang at 2019年12月5日
    	 */
    	private void catchArticleByViewUrlAndCreatetime(Rule rule, String viewUrl, String createtimeTemp, String totalCount, String getCount, String gotCount, String errorCount) throws Exception {
    		String encode = rule.getEncode();
    		encode = encode == null ? "UTF-8" : encode;
    		String content = HttpUtil.get(viewUrl, encode);
    		if("HTTP/1.1 404 Not Found".equals(content)){
    			System.out.println("详细页链接:"+viewUrl+",访问404,跳过");
    			this.addErrorCount(errorCount);
    		}else{
    			String host = rule.getHost();
    			
    			String titleRegex = rule.getTitleRegex();
    			String authorRegex = rule.getAuthorRegex();
    			String createtimeRegex = rule.getCreatetimeRegex();
    			String sourceRegex = rule.getSourceRegex();
    			String hitsRegex = rule.getHitsRegex();
    			String contentRegex = rule.getContentRegex();
    			
    			String titleFilterRegex = rule.getTitleFilterRegex();
    			String authorFilterRegex = rule.getAuthorFilterRegex();
    			String createtimeFilterRegex = rule.getCreatetimeFilterRegex();
    			String sourceFilterRegex = rule.getSourceFilterRegex();
    			String hitsFilterRegex = rule.getHitsFilterRegex();
    			String contentFilterRegex = rule.getContentFilterRegex();
    			
    			Integer subsiteId = rule.getSubsiteId();
    			Integer objId = rule.getId();
    			Integer newsClassId = rule.getNewsClassId();
    			
    			List<String> titleList = this.extractStrByPattern(content, titleRegex, errorCount, true, titleFilterRegex);
    			List<String> authorList = this.extractStrByPattern(content, authorRegex, errorCount, true, authorFilterRegex);
    			List<String> createtimeList = this.extractStrByPattern(content, createtimeRegex, errorCount, true, createtimeFilterRegex);
    			List<String> sourceList = this.extractStrByPattern(content, sourceRegex, errorCount, true, sourceFilterRegex);
    			List<String> hitsList = this.extractStrByPattern(content, hitsRegex, errorCount, true, hitsFilterRegex);
    			List<String> contentList = this.extractStrByPattern(content, contentRegex, errorCount, false, contentFilterRegex);
    			
    			String title = titleList!=null && titleList.size()>0 ? titleList.get(0).trim(): null;
    			// 没有名称的说明没有采集到  跳过
    			if(GeneralUtil.isNotNull(title)){
    				
    				/* -----------   高检 采集 start---  */
    				// 获取时间
    				/*if (createtimeStr.contains("时间")) {
    					createtimeStr = createtimeStr.substring(createtimeStr.indexOf("时间"), createtimeStr.indexOf("作者")).replace("时间:", "").trim();
    				}
    				// 获取文章来源
    				String source = sourceList!=null && sourceList.size()>0 ? sourceList.get(0): "网站原创";
    				if (source.contains("来源")) {
    					source = null != source.substring(source.indexOf("来源"), source.length()).replace("来源:", "").trim() 
    							&& "" != source.substring(source.indexOf("来源"), source.length()).replace("来源:", "").trim() 
    							? source.substring(source.indexOf("来源"), source.length()).replace("来源:", "").trim() : "网站原创";
    				}*/
    				/* -----------   高检 采集    end---  */
    				
    				/* -----------   云检 采集 start---  */
    				
    				// 获取文章来源
    				String source = sourceList!=null && sourceList.size()>0 ? sourceList.get(0): "原创";
    				if (source.contains("来源")) {
    					source = source.substring(source.indexOf("来源:"), source.indexOf("查看")).replace("来源:", "").replace("\r\n\t\t\t\t", "").trim();
    					source = null != source && source != " " && source.length() == 2 ? source : "原创";
    				}
    				/* -----------   云检 采集    end---  */
    				
    				// 作者
    				String author = authorList!=null && authorList.size()>0 ? authorList.get(0).trim(): "管理员";
    				if (author.contains("作者")) {
    					author = author.substring(author.indexOf("作者"), author.indexOf("作者")).replace("作者:", "").trim();
    					System.out.println(source.length());
    					author = null != author && author != " " && author.length() == 2 ? author : "管理员";
    				}
    				
    				String hits = hitsList!=null && hitsList.size()>0 ? hitsList.get(0).trim(): "0";
    				String articleContent = contentList!=null && contentList.size()>0 ? contentList.get(0): null;
    				
    				//处理采集到的时间
    				Date createtime = null;
    				if (GeneralUtil.isNull(createtimeTemp)) {
    					createtime = new Date();
    				} else {
    //					System.out.println(createtimeStr);
    					createtimeTemp = createtimeTemp.replaceAll("\\r.*\\n", "").trim();
    //					createtimeStr=createtimeStr.replaceAll("\u4E00-\u9FFF", "");
    					//System.out.println(createtimeStr);
    					String timeFormat = rule.getTimeFormat();
    					if (timeFormat != null && !"".equals(timeFormat)) {
    						SimpleDateFormat sdf = new SimpleDateFormat(timeFormat);
    						createtime = sdf.parse(createtimeTemp);
    					}
    				}
    				
    				if (articleExist(title, subsiteId, newsClassId)) {
    					this.addGotCount(gotCount);
    					return;
    				}
    				
    				//采集图片
    				// articleContent = catchImage(articleContent, host);
    				articleContent = catchImageByViewUrl(articleContent, viewUrl);
    				Integer aid=addArticle(title, author, createtime, source, articleContent, subsiteId, newsClassId, hits);
    				//创建一个缩略图
    				String imgRegex="/static/upload.*?pdf";//获取一个缩略图的正则
    				Pattern pattern = Pattern.compile(imgRegex, Pattern.DOTALL);
    				Matcher matcher = pattern.matcher(articleContent);
    				String imgSrc="";
    				while(matcher.find()){
    					imgSrc=matcher.group();
    					break;
    				}
    				if(GeneralUtil.isNotNull(imgSrc)){
    					String annexPath=downloadFile("http://www.ynsap.org.cn/"+imgSrc);
    					Annex annex = new Annex();
    					annex.setPath(annexPath);
    					annex.setName(imgSrc.substring(imgSrc.lastIndexOf("/")+1));
    					annex.setExt("pdf");
    					annex.setType("annex");
    					annex.setObj("article");
    					annex.setCreatetime(createtime);
    					annex.setObjId(aid);
    					annexService.save(annex);
    				}
    				
    				this.addGetCount(getCount);
    			}else{
    				System.out.println("详细页链接:"+viewUrl+",采集到的标题为空");
    				this.addErrorCount(errorCount);
    			}
    		}
    	}
    	
    	private String catchImage(String articleContent, String host) throws ParserException, InterruptedException {
    		String html = "<html>" +  articleContent + "</html>";
    		Parser parser = new Parser (html);
    		SimpleNodeIterator nodeList = parser.extractAllNodesThatMatch(new TagNameFilter("img")).elements();
    		while (nodeList.hasMoreNodes()) {
    			Tag tag = (Tag) nodeList.nextNode();
    			String src = tag.getAttribute("src");
    			if (src != null) {
    				String canDownloadSrc = src.replace("\\", "/");
    				if (canDownloadSrc.startsWith("/")) {
    					canDownloadSrc = host + canDownloadSrc;
    				} else if (!canDownloadSrc.startsWith("/") && !canDownloadSrc.startsWith("http://")) {
    					canDownloadSrc =host + "/" + canDownloadSrc;
    				}
    				
    				String newSrc = downloadFile(canDownloadSrc);
    				html = html.replace(src, newSrc);
    			}
    		}
    		html =  html.substring("<html>".length());
    		html = html.substring(0, html.length() - "</html>".length());
    		return html;
    	}
    	
    	/**
    	 * 根据文章详情页连接获取文章中图片
    	 * @param articleContent
    	 * @param viewUrl
    	 * @return
    	 * @throws ParserException
    	 * @throws InterruptedException
    	 * String
    	 * Bingyong.Wang at 2019年8月8日
    	 */
    	private String catchImageByViewUrl(String articleContent, String viewUrl) throws ParserException, InterruptedException {
    		String html = "<html>" +  articleContent + "</html>";
    		Parser parser = new Parser (html);
    		SimpleNodeIterator nodeList = parser.extractAllNodesThatMatch(new TagNameFilter("img")).elements();
    		while (nodeList.hasMoreNodes()) {
    			Tag tag = (Tag) nodeList.nextNode();
    			String src = tag.getAttribute("src");
    			if (src != null) {
    				String canDownloadSrc = src.replace("\\", "/");
    				if (canDownloadSrc.startsWith("/")) {
    					canDownloadSrc = viewUrl.substring(0, viewUrl.lastIndexOf("/") + 1) + canDownloadSrc;
    				} else if (!canDownloadSrc.startsWith("/") && !canDownloadSrc.startsWith("http://")) {
    					canDownloadSrc = viewUrl.substring(0, viewUrl.lastIndexOf("/") + 1) + canDownloadSrc;
    				}
    				
    				String newSrc = downloadFile(canDownloadSrc);
    				html = html.replace(src, newSrc);
    			}
    		}
    		html =  html.substring("<html>".length());
    		html = html.substring(0, html.length() - "</html>".length());
    		return html;
    	}
    	
    	private String downloadFile(String src) throws InterruptedException {
    		Thread.sleep(500);
    		String ext = src.substring(src.lastIndexOf(".") + 1);
    		String fileName = System.currentTimeMillis() + "." + ext;
    		String localFile =  net.aykj.listener.InitialListener.basePath + "temp/" + fileName;
    		HttpUtil.downloadFile(src, localFile);
    		return "/temp/" + fileName;
    	}
    	
    	@SuppressWarnings("rawtypes")
    	private boolean articleExist(String title, Integer subsiteId, Integer newsClassId) {
    		//判断文章是否存在
    		Map condition = new HashMap();
    		condition.put("title", title);
    		condition.put("subsiteId", subsiteId);
    		condition.put("newsClassIds", newsClassId);
    		Long count = articleService.queryArticleCountByTitle(subsiteId, newsClassId, title);
    		if (count > 0) {
    			return true;
    		}
    		return false;
    	}
    	
    	/**
    	 * 保存文章
    	 * @param title
    	 * @param author
    	 * @param createtime
    	 * @param source
    	 * @param articleContent
    	 * @param subsiteId
    	 * @param newsClassId
    	 * @param hits
    	 * @throws Exception
    	 */
    	private Integer addArticle(String title, String author, Date createtime, String source, String articleContent, 
    			Integer subsiteId, Integer newsClassId, String hits) throws Exception {
    		Article article = new Article();
    		article.setTitle(title);
    		article.setAuthor(author);
    		article.setCreatetime(createtime);
    		article.setSource(source);
    		article.setContent(articleContent);
    		article.setAudit(1);
    		article.setHits(hits == null || "".equals(hits) ? 0 : Integer.valueOf(hits));
    		return articleService.saveArticle(article, null, new Integer[]{newsClassId});
    	}
    
    	/**
    	 * 查询详细页的连接
    	 * @param rule
    	 * @param errorCount
    	 * @return
    	 * @throws Exception
    	 */
    	private List<String> queryViewUrlList(Rule rule, String errorCount) throws Exception {
    		String listUrl = rule.getListUrl();
    		String encode =  GeneralUtil.isNull(rule.getEncode()) ? "UTF-8" : rule.getEncode();
    		String viewRegex = rule.getViewRegex();
    		
    		// 列表时间正则表达式
    		String listCreatetimeRegex = rule.getListCreatetimeRegex();
    		
    		// 列表时间过滤器
    		String listCreatetimeFilter = rule.getListCreatetimeFilter();
    		
    		String host = rule.getHost();
    		// 云检
    		//host = "http://" + host + "/";
    		// 高检
    		// host = "http:";
    		if (listUrl != null) {
    			String[] listUrlArray = listUrl.split(",");
    			List<String> viewUrlList = new ArrayList<String>();
    			List<String> viewUrlTemp = new ArrayList<String>();
    			for (String url : listUrlArray) {
    				String content = HttpUtil.get(url, encode);
    				if("HTTP/1.1 404 Not Found".equals(content)){
    					System.out.println("链接:"+host+",访问404,请检查链接");
    				}else{
    					if (GeneralUtil.isNotNull(rule.getListContainerRegex())) {
    						List<String> contentList = extractStrByPattern(host, content, rule.getListContainerRegex(), errorCount);
    						if (contentList != null && contentList.size() > 0) {
    							content = contentList.get(0);
    						}
    					}
    					
    					//详细页连接在部分网站没有写绝对路径  这里要拼出完整的连接前缀
    					List<String> list = extractStrByPattern((GeneralUtil.isNotNull(rule.getPrefix()) ? rule.getPrefix() : "" ), content, viewRegex, errorCount);
    					
    					// 获取列表页时间
    					List<String> createtimeList = extractStrByPattern(content, listCreatetimeRegex, errorCount, true, listCreatetimeFilter);
    					System.out.println(createtimeList);
    					
    					// Used when the detail page has no time and it must come from the list page. Idea: bind each detail link to its list-page time, separated by "VT".
    					if (GeneralUtil.isNotNull(createtimeList)) {
    						int i = 0;
    						for (String viewList : list) {
    							List<String> vListTemp = new ArrayList<String>();
    							vListTemp.add(viewList + "VT" + (createtimeList.get(i).contains("\r\n\t\t\t\t\t\t\t\t") ? createtimeList.get(i).replace("\r\n\t\t\t\t\t\t\t\t", "") : createtimeList.get(i)));
    							i++;
    							viewUrlTemp.addAll(vListTemp);
    						}
    						//详细页连接进行倒叙  插入数据才是正着的
    						Collections.reverse(viewUrlTemp);
    						viewUrlList.addAll(viewUrlTemp);
    					} else {
    						//详细页连接进行倒叙  插入数据才是正着的
    						Collections.reverse(list);
    						viewUrlList.addAll(list);
    					}
    					
    				}
    			}
    			return viewUrlList;
    		}
    		return null;
    	}
    	
    	
    	private List<String> extractStrByPattern(String content, String regex, String errorCount) {
    		return extractStrByPattern(null, content, regex, errorCount);
    	}
    	
    	private List<String> extractStrByPattern(String prefix, String content, String regex, String errorCount) {
    		return extractStrByPattern(prefix, content, regex, errorCount, false, null);
    	}
    	
    	private List<String> extractStrByPattern(String content, String regex, String errorCount, boolean filterHtml) {
    		return extractStrByPattern(null, content, regex, errorCount, filterHtml, null);
    	}
    	
    	private List<String> extractStrByPattern(String content, String regex, String errorCount, boolean filterHtml, String filterRegex) {
    		return extractStrByPattern(null, content, regex, errorCount, filterHtml, filterRegex);
    	}
    	
    	private List<String> extractStrByPattern(String prefix, String content, String regex, String errorCount, boolean filterHtml, String filterRegex) {
    		if(GeneralUtil.isNotNull(regex)){
    			
    			List<String> list = new ArrayList<String>();
    			Pattern pattern = Pattern.compile(regex, Pattern.DOTALL);
    			Matcher matcher = pattern.matcher(content);
    			boolean isFound = false;
    			while(matcher.find()) {
    				isFound = true;
    				String g = matcher.group();
    				
    				if (filterHtml) {
    					g = g.replaceAll("^[\u00ff\uffff]", "").replaceAll("<.*?>", "").replaceAll("&.*?;", "");
    				}
    				
    				if (GeneralUtil.isNotNull(filterRegex)) {
    					g = g.replaceAll(filterRegex, "");
    				}
    				
    				if (GeneralUtil.isNotNull(prefix)) {
    					list.add(prefix + g);
    				} else {
    					list.add(g);
    				}
    			}
    			if (!isFound) {
    				addErrorCount(errorCount);
    			}
    			return list;
    		}else{
    			return null;
    		}
    	}
    	
    	
    	private void addGetCount(String getCount) {
    		Integer count = servletContext.getAttribute(getCount) ==  null ? 0 : (Integer)servletContext.getAttribute(getCount);
    		count++;
    		servletContext.setAttribute(getCount, count);
    	}
    	
    	private void addGotCount(String gotCount) {
    		Integer count = servletContext.getAttribute(gotCount) ==  null ? 0 : (Integer)servletContext.getAttribute(gotCount);
    		count++;
    		servletContext.setAttribute(gotCount, count);
    	}
    	
    	private void addErrorCount(String errorCount) {
    		Integer count = servletContext.getAttribute(errorCount) ==  null ? 0 : (Integer)servletContext.getAttribute(errorCount);
    		count++;
    		servletContext.setAttribute(errorCount, count);
    	}
    	
    	private boolean isStop(String totalCount, String getCount, String gotCount, String errorCount)	{
    		if (stop) {
    			servletContext.removeAttribute(rulesName);
    			servletContext.removeAttribute(totalCount);
    			servletContext.removeAttribute(getCount);
    			servletContext.removeAttribute(gotCount);
    			servletContext.removeAttribute(errorCount);
    			return true;
    		}
    		return false;
    	}
    	
    	public void clear(String totalCount, String getCount, String gotCount, String errorCount) {
    		this.stop = true;
    		isStop(totalCount, getCount, gotCount, errorCount);
    	}
    
    	public boolean isStop() {
    		return stop;
    	}
    
    	public void setStop(boolean stop) {
    		this.stop = stop;
    	}
    	
    	
    	
    	public static void main(String[] args) throws ParseException {
    		String content = FileUtil.readFileToString(new File("F:\\test.txt"), "UTF-8");
    		String regex = "E_ReadNews.asp\\?NewsID=[0-9]*";
    		Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE);
    		Matcher matcher = pattern.matcher(content);
    		while(matcher.find()) {
    			String g = matcher.group();
    			System.out.println(g);
    		}
    	}
    	
    }
    
    /** File download helper. In the original post this method is pasted after the class; it presumably belongs to the HttpUtil class referenced above and relies on commons-httpclient (HttpClient, GetMethod) plus FileOutputStream and URLEncoder. */
    public static String downloadFile(String remoteFile, String localFile) {
    		// URL-encode any Chinese characters in the remote file URL so the request does not break
    		String zwRegex = "[\u4e00-\u9fa5]";
    		Pattern pattern = Pattern.compile(zwRegex, Pattern.DOTALL);
    		Matcher matcher = pattern.matcher(remoteFile);
    		String newRemoteFile = remoteFile;
    		while(matcher.find()){
    			String zw = matcher.group();
    			try {
    				String zwbm = URLEncoder.encode(zw, "utf-8");
    				newRemoteFile = newRemoteFile.replaceAll(zw, zwbm);
    			} catch (UnsupportedEncodingException e) {
    				// TODO Auto-generated catch block
    				e.printStackTrace();
    			}
    		}
    		remoteFile = newRemoteFile;
    		//处理完毕
    		
            FileOutputStream output = null;
            String message = null;
            GetMethod get = null;
            try {
            	HttpClient client = new HttpClient();  
                get = new GetMethod(remoteFile); 
    			client.executeMethod(get);
    			
    			localFile = localFile.replace("\\", "/");
    			String dirStr = localFile.substring(0, localFile.lastIndexOf("/"));
    			File dirFile = new File(dirStr);
    			if (!dirFile.exists()) dirFile.mkdirs();
    			
    			File storeFile = new File(localFile);  
    	        output = new FileOutputStream(storeFile);  
    	        output.write(get.getResponseBody());  
    	        if (get.getStatusCode() != 200) {
    	        	message = get.getStatusText();
    	        } else {
    	        	message =get.getStatusCode() + "";
    	        }
    		} catch (HttpException e) {
    			e.printStackTrace();
    		} catch (IOException e) {
    			e.printStackTrace();
    		} finally {
    			 try {
    				 if (output != null) {
    					 output.flush(); 
    					 output.close();
    				 }
    				 if (get != null) get.abort();
    			} catch (IOException e) {
    				e.printStackTrace();
    			} 
    		}
    		return message;
    	}
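
    For reference, a minimal sketch of how this thread might be started from a controller. The constructor signature is the one defined above; the service instances, the ServletContext, and the values "newsRules" and 1 are placeholders for whatever the caller has at hand, and registering the thread under rulesName is only an assumption based on isStop() removing that attribute.

    SpiderThreadUtil spider = new SpiderThreadUtil(
            ruleService, articleService, annexService,
            servletContext, "newsRules", 1);          // "newsRules" / 1 are placeholder rulesName / rulesId
    servletContext.setAttribute("newsRules", spider); // assumed registration so the thread can be found and stopped later
    spider.start();                                   // run() resolves the detail links and collects the articles
    // ... later, to stop collection early:
    spider.setStop(true);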

     

  • Mainly covers "Java crawler meets asynchronously loaded data - try these two approaches!": how to solve the problem, explained in detail with sample code; a useful reference for study or work if you need it.
  • A complete Java web data collector, including a UI, log output, configuration and more; it can be extended on your own and opened and run directly in Eclipse.
  • The site analyzed in the previous post was a national-level one and we never actually wrote crawling code; today we take the "1药网" pharmacy site as an example and crawl its drug data https://www.111.com.cn/ 1. Analyze the site - open the homepage 2. Click "All product categories"; the corresponding URL is ...

    The site analyzed in the previous post was a national-level one, and we never actually wrote code to crawl its data. Today we take the "1药网" pharmacy site as an example and crawl some drug data:

    https://www.111.com.cn/
    

    1. Analyze the site
    Open the site's homepage.

    2. Click "All product categories"; the corresponding URL is:

    https://www.111.com.cn/categories/ 
    

    First crawl all of the "categories", then use each category to fetch all of the products under it.
    The method that issues the GET request:

    public static String sendGet(String url){
        String result = null;
        //Creates CloseableHttpClient instance with default configuration.
        CloseableHttpClient httpCilent = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        try {
            CloseableHttpResponse response = httpCilent.execute(httpGet);
            result = EntityUtils.toString(response.getEntity());
            System.out.println(result);
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            try {
                httpCilent.close();//释放资源
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return result;
    }
    

    Fetch the HTML of the "drug categories" page:

    public static  String getCategories(){
        String html =  HttpUtils.sendGet("https://www.111.com.cn/categories/");
        return html;
    }
    

    3. Analyze the HTML of the crawled drug categories carefully.
    Parse every category link out of the page.
    4. Parse all category links:

    public static List<String> processCategoriesHtml(String html){
        List<String> links = new ArrayList<String>();
        if(html!=null && !"".equals(html.trim())){
            try {
                Parser parser = new Parser(html);
                // Define a filter that keeps only <a> (link) tags
                NodeFilter afilter = new NodeClassFilter(LinkTag.class); // filter for <a> tags
                NodeList nodeList =  parser.extractAllNodesThatMatch(afilter);
                for(int i=0; i<nodeList.size(); i++){
                    Node aNode =  nodeList.elementAt(i);
                    LinkTag aLinkTag =  (LinkTag)aNode;
                    if(aLinkTag.getLink()!=null && aLinkTag.getLink().contains("categories")){
                        links.add(aLinkTag.getLink());
                        System.out.println(aLinkTag.getLink());
                    }
                }
            }catch (Exception e){
                e.printStackTrace();
            }
    
        }
        return links;
    }
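
    To tie the two helpers together, a minimal driver might look like the sketch below. It assumes the method sits in the same class as getCategories and processCategoriesHtml shown above; nothing else is new.

    public static void main(String[] args){
        // 1. Fetch the category page HTML (wraps HttpUtils.sendGet as shown above)
        String html = getCategories();
        // 2. Parse out every link that points at a category
        List<String> categoryLinks = processCategoriesHtml(html);
        // 3. Each category link can then be fetched in turn to list its products
        for(String link : categoryLinks){
            System.out.println("category: " + link);
        }
    }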
    

    5. Now let's see how many "Durex" siblings there really are.
    Among the links crawled in the previous step is this one:

    //www.111.com.cn/categories/965327-j1
    

    As before, first fetch the HTML of that page:

    public static String getDLS_Html(){
        String html =  HttpUtils.sendGet("https://www.111.com.cn/categories/965327-j1");
        return html;
    }
    

    Analyze the fetched HTML.
    Every product link contains "product" and carries class="product_pic pro_img", so the following code extracts all of the "Durex" product links:

    public static List<String> processDLSHtml(String html){
        List<String> links = new ArrayList<String>();
        if(html!=null && !"".equals(html.trim())){
            try {
                Parser parser = new Parser(html);
                // Define a filter that keeps only <a> (link) tags
                NodeFilter afilter = new NodeClassFilter(LinkTag.class); // filter for <a> tags
                NodeList nodeList =  parser.extractAllNodesThatMatch(afilter);
                for(int i=0; i<nodeList.size(); i++){
                    Node aNode =  nodeList.elementAt(i);
                    LinkTag aLinkTag =  (LinkTag)aNode;
                    if(aLinkTag.getLink()!=null && aLinkTag.getLink().contains("product")){
                        links.add(aLinkTag.getLink());
                        System.out.println(aLinkTag.getLink());
                    }
                }
            }catch (Exception e){
                e.printStackTrace();
            }
    
        }
        return links;
    }
    

    7. Next we can fetch every product's specifications, and from there work out things like which item sells best.
    As before, fetch the detail page first:

    public static String getDLSDetail_Html(String url){
        String html =  HttpUtils.sendGet("https:"+url);
        return html;
    }
    

    Analyzing the HTML shows that you only need to parse the information inside the product-detail div to get everything about the product. How to extract it is left for you to work out; a starting sketch follows.
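
    As a starting point, here is a rough sketch that reuses the same htmlparser API as above. The div class name "product_details" is a guess and has to be replaced with the real class seen in the page source; the filters come from org.htmlparser.filters (AndFilter, HasAttributeFilter, NodeClassFilter) and org.htmlparser.tags.Div.

    public static String processDLSDetailHtml(String html){
        String detailText = null;
        if(html!=null && !"".equals(html.trim())){
            try {
                Parser parser = new Parser(html);
                // Keep only <div> tags whose class attribute matches the (assumed) detail container
                NodeFilter divFilter = new AndFilter(
                        new NodeClassFilter(Div.class),
                        new HasAttributeFilter("class", "product_details")); // hypothetical class name
                NodeList nodeList = parser.extractAllNodesThatMatch(divFilter);
                if(nodeList.size() > 0){
                    // Plain text of the first matching div: name, specification, price, and so on
                    detailText = nodeList.elementAt(0).toPlainTextString().trim();
                }
            }catch (Exception e){
                e.printStackTrace();
            }
        }
        return detailText;
    }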

  • Baidu Cloud link with every resource needed to set up the environment: http://pan.baidu.com/s/1kVLuSdD, password: uek3. Open Eclipse and create a Java project; once it is created, select the new project, right-click and choose Build Path - Configure Build Path, then add the jar files...

    Link: https://pan.baidu.com/s/1ud23RptUoqAh798INliK4g   Password: hiol

    Chrome browser driver download:

    Link: https://pan.baidu.com/s/1S3XmXmhW7_r5f-mY99Xv7A   Password: aryd

    The steps are as follows:

    Open Eclipse and create a Java project.


    Once it is created, select the new project, right-click, and choose Build Path - Configure Build Path:


    Add the jar files.


    Import these three sets of jars (using Selenium 2.40.0 as the example). 1: every jar under selenium-2.40.0\libs


    Select all of those files and import them. When that is done, click Add External JARs again and import the second jar: selenium-java-2.40.0.jar


    When that is done, click Add External JARs once more and import the third jar: selenium-server-standalone-2.40.0.jar

    Debugging the code in Eclipse:

    // Requires: org.openqa.selenium.By, WebDriver, WebElement, org.openqa.selenium.chrome.ChromeDriver, java.util.concurrent.TimeUnit
    public static void main(String[] args) throws InterruptedException {
        // Location of the chromedriver executable
        System.setProperty("webdriver.chrome.driver", "E:\\webDriver\\chromedriver5.exe");
        WebDriver driver = new ChromeDriver();
        driver.navigate().to("http://www.baidu.com");
        // Type the query into the search box and click the search button
        driver.findElement(By.id("kw")).sendKeys("百度公司");
        WebElement element = driver.findElement(By.xpath("//*[@id='su']"));
        element.click();
        TimeUnit.SECONDS.sleep(1);
        // Open the third search result
        //WebElement element2 = driver.findElement(By.linkText("百度在线网络技术(北京)有限公司_百度百科"));
        WebElement element2 = driver.findElement(By.xpath("//*[@id='3']/h3/a"));
        element2.click();
        TimeUnit.SECONDS.sleep(1);
        //Actions action = new Actions(driver);
        //action.keyDown(Keys.CONTROL).sendKeys(Keys.TAB).perform();
    }
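
    The fixed TimeUnit.SECONDS.sleep(1) calls above either waste time or fail when the page loads slowly. A small sketch of the same step with an explicit wait (org.openqa.selenium.support.ui, shipped in the jars imported above) is shown below; the locator is the one already used in the code, everything else is illustrative.

    // import org.openqa.selenium.support.ui.ExpectedConditions;
    // import org.openqa.selenium.support.ui.WebDriverWait;
    WebDriverWait wait = new WebDriverWait(driver, 10);  // wait up to 10 seconds
    WebElement result = wait.until(
            ExpectedConditions.elementToBeClickable(By.xpath("//*[@id='3']/h3/a")));
    result.click();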



  • I want to use Java to crawl some data from our company's system. My current implementation looks like this: ``` try { URL url = new URL("http://192.168.10.17:8080/xxxx"); // not exposing the real company URL here URLConnection uconn = url.openConnection...
  • A 51job Java crawler with data statistics, to help you analyze local salary levels and decide, based on your own situation, whether to change jobs or stay.
  • Java crawler: crawling data

    2019-11-19 18:53:38
    I will not share the crawled images here, for fear of being reported, hehe. Now let me share my project structure: I built it as a Maven project and you can do the same. Here is the pom.xml file <dependencies> <dependency> <groupId>...
  • Java crawler: following page navigation to crawl data

    1,000+ reads 2017-08-29 19:12:17
    Crawling data from the relevant Zhongguancun (中关村) pages with Java.
  • Java crawler

    10,000+ reads, many likes 2019-06-11 17:32:29
    In the big-data era, getting more data means mining, analyzing and filtering it. For example, when a project needs a large amount of real data, we have to crawl certain websites; some crawled data still cannot be used directly after being saved to the database and needs further...
  • Implementing data scraping with a Java crawler

    1,000+ reads 2016-09-12 21:09:58
    In this post we implement basic crawler data scraping in Java. The basic idea is to request the URL from Java, take the returned HTML document, and parse it with jsoup (see the jsoup sketch after this list). First let's look at the page to scrape, using my own CSDN blog list page as the example. For each post we...
  • Crawler program: a Java data-mining crawler program
  • Java image-crawling demo

    2020-07-02 15:39:42
    import java.util.UUID; public class UrlFileDownloadUtil { public static void downloadPicture(String url) { String file = "D:\\image\\"; try { URL u = new URL(url); String name = UUID.randomUUID()....
  • This crawler was built because the company needed data scraped from the web. Java is not the strongest choice for crawling, so many frameworks are not very mature, but after a lot of research the HtmlUnit and jsoup frameworks together can fetch data returned asynchronously by Ajax (see the HtmlUnit sketch after this list). Dependencies used: <dependencies> <...
  • A Java crawler that extracts table data from a web page and saves it to a MySQL database, complete code included.
  • Java crawler

    1,000+ reads, many likes 2019-04-12 11:59:12
    Fetch the data, parse the data, save the data. Case 1: crawling novels from Qidian (起点中文网). Case 2: logging into a website with the crawler and fetching that user's information. 1. Basic crawler concepts. 1.1 What is a crawler: a web crawler is a program that uses a particular parsing...
  • Preface and prerequisites: as of this post's publication, this crawling approach no longer runs because of an htmlunit problem, and since I will not maintain the feature any further I will not fix it. If you urgently need a solution... The resulting data structure looks like this (some long data
  • Data cleaning is a major part of crawling, and you usually cannot get around three techniques: 1. regular expressions, 2. XPath expressions, 3. JSON. On sending requests from a Java crawler: parsing JSON in Java needs a third-party package, downloaded with Maven <dependency> <...
  • Java crawler: extracting table data from a web page

    1,000+ reads 2018-07-07 15:39:11
    // Java crawler: extract table data from a web page public class Pachong implements Runnable { public void run() { String Rpt_date = null; double price = 0; // page URL String url = "...
  • This post shares methods and tips for how the jsoup Java crawler library parses class attributes that contain multiple spaces; follow along if you need it.
  • Crawling data keeps getting harder because of all the anti-crawling measures; simple sites do little to block crawlers, so just briefly: 1. Grab some free HTTP proxy IPs somewhere and crawl through them; that is too simple to cover (the most usable proxy IPs at the moment are from 蘑菇代理). Now for the part with a bit more crawling technique to it,...
  • Java crawler: scraping city data

    1,000+ reads 2017-09-26 22:40:06
    Java crawler: scraping city data. Requirement: scrape the city/address information from the site and persist it, to support a three-level linked city lookup. Here I cover crawling the three city levels. Analyzing the page: opening the developer tools shows the following, from which we can see that the city information is all...
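
    As referenced in the jsoup entry above, a minimal fetch-and-parse sketch with Jsoup might look like the following. The URL and the CSS selector are placeholders and have to be adapted to the real blog list markup.

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    public class JsoupListDemo {
        public static void main(String[] args) throws Exception {
            // Fetch the list page and let Jsoup build a DOM from it
            Document doc = Jsoup.connect("https://blog.csdn.net/yourname")   // placeholder URL
                    .userAgent("Mozilla/5.0")
                    .get();
            // ".article-item-box h4 a" is a hypothetical selector; inspect the page to find the real one
            for (Element link : doc.select(".article-item-box h4 a")) {
                System.out.println(link.text() + " -> " + link.attr("href"));
            }
        }
    }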

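
    For the HtmlUnit entry above (data loaded by Ajax), a rough sketch is shown below, assuming a recent HtmlUnit 2.x; waitForBackgroundJavaScript gives the page's scripts time to finish before the rendered markup is handed to jsoup. The URL is a placeholder.

    import com.gargoylesoftware.htmlunit.BrowserVersion;
    import com.gargoylesoftware.htmlunit.WebClient;
    import com.gargoylesoftware.htmlunit.html.HtmlPage;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    public class HtmlUnitAjaxDemo {
        public static void main(String[] args) throws Exception {
            try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
                webClient.getOptions().setJavaScriptEnabled(true);      // run the page's JavaScript
                webClient.getOptions().setCssEnabled(false);            // CSS is not needed for scraping
                webClient.getOptions().setThrowExceptionOnScriptError(false);
                HtmlPage page = webClient.getPage("https://example.com/list"); // placeholder URL
                webClient.waitForBackgroundJavaScript(5000);            // give Ajax calls time to finish
                // Hand the rendered markup to jsoup for the actual extraction
                Document doc = Jsoup.parse(page.asXml());
                System.out.println(doc.title());
            }
        }
    }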