  • Python Crawler Example

    2017-09-19 14:28:34
    '''
    China university ranking project
    Functional description:
    Input:  URL of the university ranking page
    Output: ranking information printed to the screen (rank, university name, total score)
    Technical route: requests-bs4
    Focused crawler: only the given URL is crawled; no link expansion
    '''
    
    '''
    Program structure
    Step 1: fetch the ranking page from the network
    Step 2: extract the information into a suitable data structure (a 2-D list)
    Step 3: use that data structure to display and print the results
    '''
    import re
    from bs4 import BeautifulSoup
    import bs4
    import requests
    
    # Step 1: fetch the ranking page from the network
    def getHTMLText(url):
        try:
            r = requests.get(url,timeout = 30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except:
            return ""
    
    # Step 2: extract the ranking information into a 2-D list
    def fillUnivList(uList,html):
        soup = BeautifulSoup(html,"html.parser")
        for tr in soup.find("tbody").children:
            if isinstance(tr,bs4.element.Tag):
                tds = tr("td")
                # rank, university name, total score
                uList.append([tds[0].string, tds[1].string, tds[2].string])
    
    # Step 3: display and print the results
    def printUnivList(uList,num):
        print("{:^10}\t{:^6}\t{:^10}".format("排名","学校","分数"))
        for i in range(num):
            u = uList[i]
            print("{:^10}\t{:^6}\t{:^10}".format(u[0],u[1],u[2]))
        print("Suc"+str(num))
    
    def main():
        uinfo = []
        url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html"
        html = getHTMLText(url)
        fillUnivList(uinfo,html)
        printUnivList(uinfo,20)  # top 20 universities
    
    main()
    
    '''
    Optimization
    Cause of the Chinese alignment problem: the default fill character is the
    half-width space, so columns that contain Chinese characters drift.
    Fix: use the full-width (Chinese) space, chr(12288), as the fill character.
    '''
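
    # A minimal sketch of the fix described above (this function is not in the
    # original post): pass chr(12288), the full-width space, as the fill
    # character so that columns containing Chinese characters stay aligned.
    def printUnivListAligned(uList, num):
        tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
        print(tplt.format("排名", "学校", "分数", chr(12288)))
        for i in range(num):
            u = uList[i]
            print(tplt.format(u[0], u[1], u[2], chr(12288)))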
    
    '''
    NumPy random-number functions (the np.random sub-library)
    rand()
    randn()
    randint()
    seed()
    
    More np.random functions:
    uniform(low,high,size)
    normal(loc,scale,size)
    poisson(lam,size)
    '''
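
    # A quick demo of the functions listed above (illustrative only; not part
    # of the crawler):
    import numpy as np
    np.random.seed(0)                        # make the results reproducible
    print(np.random.rand(2, 3))              # uniform on [0, 1), shape (2, 3)
    print(np.random.randn(2, 3))             # standard normal samples
    print(np.random.randint(1, 10, (2, 3)))  # integers in [1, 10)
    print(np.random.uniform(0, 10, 5))       # uniform(low, high, size)
    print(np.random.normal(10, 5, 5))        # normal(loc, scale, size)
    print(np.random.poisson(5, 5))           # poisson(lam, size)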
    
    
  • python crawler examples

    2020-09-25 15:17:04

    python crawler examples

     

    1. Scrape and download images

    Preparation:

      pip install requests

      pip install BeautifulSoup4

      pip install lxml

    Directory structure (screenshot not included; the script below expects an images/ folder and a user_agent.txt file in the working directory)

    Code example:

     


    import os
    import re
    from uuid import uuid1
    import requests
    from bs4 import BeautifulSoup
    from random import choice
    
    
    # Pick a random request header (User-Agent) from user_agent.txt
    def get_headers():
        file = open('user_agent.txt', 'r')
        user_agent_list = file.readlines()
        user_agent = str(choice(user_agent_list)).replace('\n', '')
        user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0' if len(
            user_agent) < 10 else user_agent
        headers = {
            "User-Agent": user_agent,
        }
        return headers
    
    
    # Download a single image and save it under images/
    def download(src, end):
        try:
            headers = get_headers()
            response = requests.get(src, headers=headers)
            # response.content is the binary payload of the image
            img = response.content
            print(img)  # debug output: prints the raw bytes
            path = "images/" + str(uuid1()) + end
            # open the local file in 'wb' mode: w = write, b = binary
            with open(path, 'wb') as f:
                f.write(img)
        except Exception as e:
            pass
    
    
    # Request a listing page and extract the image links
    def requests_get(url):
        try:
            headers = get_headers()
            # request the page
            response = requests.get(url, headers=headers)
            # parse it
            soup = BeautifulSoup(response.text, 'lxml')
            image_list = soup.find_all(attrs={"class": "img-responsive"})
            for image in image_list[:-1]:
                # image URL
                src = image.attrs["data-backup"]
                # file extension of the image
                end = os.path.splitext(src)[1]
                if src and end:
                    # strip special characters from the extension
                    end = re.sub(r'[,。??,/\\·]', '', end)
                    # download the image
                    download(src, end)
                else:
                    pass
        except Exception as e:
            print(e)
            pass
    
    
    if __name__ == '__main__':
        # loop over listing pages
        for page in range(1, 5):
            url = 'https://www.doutula.com/photo/list/?page=%d' % page
            requests_get(url)
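
    Note: get_headers() reads a user_agent.txt file from the working directory, one User-Agent string per line, and falls back to a hard-coded Firefox UA when the chosen line is too short. The original post does not show the file, but an illustrative user_agent.txt could contain:

        Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0
        Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36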


    Result: (screenshot omitted)

    2. Scrape news from Autohome (autohome.com.cn)

    Code example:


    import requests
    from bs4 import BeautifulSoup
    
    # request the page
    response = requests.get("https://www.autohome.com.cn/news/")
    # the page is GBK encoded
    response.encoding = 'gbk'
    # parse the page
    soup = BeautifulSoup(response.text,'html.parser')
    # find the div node with id="auto-channel-lazyload-article"
    div = soup.find(name='div',attrs={'id':'auto-channel-lazyload-article'})
    # find all li tags inside that div
    li_list = div.find_all(name='li')
    for li in li_list:
        # news title
        title = li.find(name='h3')
        if not title:
            continue
        # summary paragraph
        p = li.find(name='p')
        # link to the article
        a = li.find(name='a')
        # image tag
        img = li.find(name='img')
        src = img.get('src')
        src = "https:" + src
        print(title.text)
        print(a.attrs.get('href'))
        print(p.text)
        print(src)
        # make another request to download the image
        file_name = src.rsplit('images/',maxsplit=1)[1]
        ret = requests.get(src)
        with open(file_name,'wb') as f:
            f.write(ret.content)


    Result: (screenshot omitted)

     

     

    3. Scrape and download images from unsplash

    Directory structure: (screenshot omitted; the script saves images under static/images/)

    Code example:


    # Scrape images from the unsplash API
    
    import time
    import requests
    import json
    
    
    # Fetch one page of the image list from the unsplash API
    def get_image_list(url):
        response = requests.get(url=url)
        data_list = json.loads(response.text)
        for data in data_list:
            id = data["id"]
            image_list = [
                {
                    "file_path" : "static/images/" + id + "-raw.png",
                    "url": data["urls"]["raw"]
                },
                {
                    "file_path": "static/images/" + id + "-full.png",
                    "url": data["urls"]["full"]
                },
                {
                    "file_path": "static/images/" + id + "-regular.png",
                    "url": data["urls"]["regular"]
                },
                {
                    "file_path": "static/images/" + id + "-thumb.png",
                    "url": data["urls"]["thumb"]
                },
                {
                    "file_path": "static/images/" + id + "-small.png",
                    "url": data["urls"]["small"]
                }
            ]
            for image in image_list:
                download_image(image)
    
    # Download a single image
    def download_image(image):
        print(image)
        url = image["url"]
        response = requests.get(url)
        # response.content is the binary payload of the image
        img = response.content
        # write it to the local file in binary mode ('wb')
        with open(image["file_path"], 'wb') as f:
            f.write(img)
    
    
    if __name__ == '__main__':
        for i in range(2,100):
            url = "https://unsplash.com/napi/photos?page={}&per_page=12".format(i)
            get_image_list(url)
            time.sleep(60)


    Result: each image is saved in five sizes (screenshot omitted)

    4. Scrape wallpapers from netbian.com

    Directory structure: (screenshot omitted; wallpapers are saved under static/images/)

    Code example:


    # Scrape wallpapers from www.netbian.com
    
    import time
    import requests
    from bs4 import BeautifulSoup
    
    class Aaa():
        headers = {
            "Cookie": "__cfduid=db706111980f98a948035ea8ddd8b79c11589173916",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
        }
    
        def get_cookies(self):
            url = "http://www.netbian.com/"
            response = requests.get(url=url)
            self.headers ={
                "Cookie":"__cfduid=" + response.cookies["__cfduid"],
                "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
            }
    
        # Fetch the listing page and follow each thumbnail link
        def get_image_list(self,url):
            try:
                response = requests.get(url=url,headers=self.headers)
                response.encoding = 'gbk'
                soup = BeautifulSoup(response.text,'lxml')
                li_list = soup.select("#main > div.list > ul > li")
                for li in li_list:
                    href = "http://www.netbian.com" + li.select_one("a").attrs["href"]
                    self.get_image(href)
            except:
                self.get_cookies()
    
    
        def get_image(self,href):
            try:
                response = requests.get(url=href,headers=self.headers)
                response.encoding = 'gbk'
                soup = BeautifulSoup(response.text, 'lxml')
                image_href = "http://www.netbian.com" + soup.select_one("#main > div.endpage > div > p > a").attrs["href"]
                self.get_image_src(image_href)
            except:
                self.get_cookies()
    
    
        def get_image_src(self,href):
            try:
                response = requests.get(url=href,headers=self.headers)
                response.encoding = 'gbk'
                soup = BeautifulSoup(response.text, 'lxml')
                src = soup.select("img")[1].attrs["src"]
                self.download_image(src)
            except:
                self.get_cookies()
    
        # Download the wallpaper
        def download_image(self,image_src):
            try:
                # use the current timestamp as the file name
                title = str(time.time()).replace('.', '')
                image_path = "static/images/" + title + ".png"
                response = requests.get(image_src,headers=self.headers)
                # response.content is the binary payload of the image
                img = response.content
                # write it to the local file in binary mode ('wb')
                with open(image_path, 'wb') as f:
                    f.write(img)
            except:
                self.get_cookies()
    
    
    if __name__ == '__main__':
        aaa = Aaa()
        aaa.get_cookies()
        for i in range(2,100):
            url = "http://www.netbian.com/meinv/index_{}.htm".format(i)
            aaa.get_image_list(url)
            time.sleep(10)


    Result: (screenshot omitted)

  • python crawler examples

    2018-12-31 21:21:00

    python crawler examples

     

    Here are two crawler examples from when I first started learning Python: one scrapes JD.com reviews of Moutai liquor, the other scrapes Sina domestic news. Both follow online tutorials, with slightly modified code, and are offered for reference and study.

    Both can be run inside Anaconda.

    The first example scrapes Sina domestic news and writes the articles to an Excel file with pandas:
    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    import json
    import pandas
    news_total=[]
    commentURL='http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
    url='http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1509373193047'
    def parseListLinks(url):
        newsdetails=[]
        res = requests.get(url)
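        # the API returns JSONP; strip the 'newsloadercallback(...)' wrapper before parsing the JSON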
        jd= json.loads(res.text.strip().lstrip('newsloadercallback(').rstrip(');'))
        for ent in jd['result']['data']:
            newsdetails.append(getNewsDetail(ent['url']))
        return newsdetails
            
    def getNewsDetail(newsurl):
        result={}
        res=requests.get(newsurl)
        res.encoding='utf-8'
        soup=BeautifulSoup(res.text,'html.parser')     
        result['title']=soup.select('#artibodyTitle')[0].text
        result['newssource']=soup.select('.time-source span a')[0].text
        timesource=soup.select('.time-source')[0].contents[0].strip()
        dt1=datetime.strptime(timesource,'%Y年%m月%d日%H:%M')
        result['dt'] =dt1.strftime('%Y-%m-%d-%H:%M')
        result['article']=' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])
        result['editor']=soup.select('.article-editor')[0].text.strip('责任编辑:')
        result['comments']=getCommentCounts(newsurl)
        print('获得一条新闻')
        return result      
           
        
    def getCommentCounts(newsurl):
        m=re.search('doc-i(.+).shtml',newsurl)
        newsid=m.group(1)
        comments=requests.get(commentURL.format(newsid))
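        # the response body looks like 'var data={...}'; strip the prefix before json.loads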
        jd=json.loads(comments.text.strip('var data='))
        return jd['result']['count']['total'] 
    
    for i in range(1,8):
        print('正在爬取第'+str(i)+'页......')
        newsurl=url.format(i)
        newsary= parseListLinks(newsurl)
        news_total.extend(newsary)
    print('抓取结束')                                 
    df=pandas.DataFrame(news_total)
    df.to_excel('news.xlsx')

    The second example scrapes JD.com product reviews of Moutai and writes them to an Excel sheet with xlwt:
    import requests 
    import re
    import json
    import time
    import xlwt
    
    #
    # Configure the worksheet font/style.
    # You do not need to understand the details: with the four lines
    # below you can write Chinese text into the spreadsheet.
    #
    style=xlwt.XFStyle()
    font=xlwt.Font()
    font.name='SimSun'
    style.font=font
    
    # create a workbook
    w=xlwt.Workbook(encoding='utf-8')
    # add a sheet
    ws=w.add_sheet('sheet 1',cell_overwrite_ok=True)
    # current row the next record will be written to
    row=1
    #
    # write the header row
    #
    ws.write(0,0,'content')
    ws.write(0,1,'userClientShow')
    ws.write(0,2,'creationTime')
    ws.write(0,3,'userLevelName')
    ws.write(0,4,'productColor')
    ws.write(0,5,'userLevelId')
    ws.write(0,6,'score')
    ws.write(0,7,'referenceName')
    ws.write(0,8,'referenceTime')
    ws.write(0,9,'isMobile')
    ws.write(0,10,'nickname')
    
    #
    # take one JSON object (one page of comments)
    # and write its contents into the spreadsheet
    #
    def write_json_to_xls(dat):
        global row
        for comment in dat['comments']:
            ws.write(row,0,comment['content'])
            ws.write(row,1,comment['userClientShow'])
            ws.write(row,2,comment['creationTime'])
            ws.write(row,3,comment['userLevelName'])
            ws.write(row,4,comment['productColor'])
            ws.write(row,5,comment['userLevelId'])
            ws.write(row,6,comment['score'])
            ws.write(row,7,comment['referenceName'])
            ws.write(row,8,comment['referenceTime'])
            ws.write(row,9,comment['isMobile'])
            ws.write(row,10,comment['nickname'])
            row+=1
    
    #
    # fetch the data page by page
    #
    for i in range(1,10+1):
        url='https://club.jd.com/comment/productPageComments.action?productId=1475512465&score=0&sortType=5&page=%d&pageSize=100&isShadowSku=0&fold=' % i
        try:
            json_req = requests.get(url)
            dat = json_req.json()
            write_json_to_xls(dat)
            print(u'写入一页数据')
        except Exception as e:
           print(u'获取数据失败数据',e)
        time.sleep(0.5)
    
    
    # save the workbook to disk
    w.save('result.xls')

    Reposted from: https://www.cnblogs.com/Jeremy2001/p/10203323.html

  • Python Advanced (20) - A Python Crawler Example Explained

    2017-03-26 10:15:47

    #Python Advanced (20) - A Python Crawler Example Explained
      This post walks through a Python crawler example, focusing on the crawler architecture and the key modules that make up the crawler: the URL manager, the HTML downloader, and the HTML parser.
    ##Simple crawler architecture

    ![crawler architecture](https://img-blog.csdnimg.cn/img_convert/e8f3a1db63d42ea7115d277ba869a35c.png)

    ##Program entry point (the crawler scheduler)
    #coding:utf8
    import time, datetime
    
    from maya_Spider import url_manager, html_downloader, html_parser, html_outputer
    
    
    class Spider_Main(object):
        # initialization
        def __init__(self):
            # URL manager
            self.urls = url_manager.UrlManager()
            # HTML downloader
            self.downloader = html_downloader.HtmlDownloader()
            # HTML parser
            self.parser = html_parser.HtmlParser()
            # HTML outputer
            self.outputer = html_outputer.HtmlOutputer()
    
        # crawler scheduling loop
        def craw(self, root_url):
            count = 1
            self.urls.add_new_url(root_url)
            while self.urls.has_new_url():
                try:
                    new_url = self.urls.get_new_url()
                    print('craw %d : %s' % (count, new_url))
                    html_content = self.downloader.download(new_url)
                    new_urls, new_data = self.parser.parse(new_url, html_content)
                    self.urls.add_new_urls(new_urls)
                    self.outputer.collect_data(new_data)
    
                    if count == 10:
                        break
    
                    count = count + 1
                except:
                    print('craw failed')
    
            self.outputer.output_html()
    
    if __name__ == '__main__':
        # crawl entry point
        root_url = 'http://baike.baidu.com/view/21087.htm'
        # start time
        print('开始计时..............')
        start_time = datetime.datetime.now()
        obj_spider = Spider_Main()
        obj_spider.craw(root_url)
        # end time
        end_time = datetime.datetime.now()
        print('总用时:%ds'% (end_time - start_time).seconds)
    

    ##URL manager

    class UrlManager(object):
        def __init__(self):
            self.new_urls = set()
            self.old_urls = set()
    
        def add_new_url(self, url):
            if url is None:
                return
            if url not in self.new_urls and url not in self.old_urls:
                self.new_urls.add(url)
    
        def add_new_urls(self, urls):
            if urls is None or len(urls) == 0:
                return
            for url in urls:
                self.add_new_url(url)
    
        def has_new_url(self):
            return len(self.new_urls) != 0
    
        def get_new_url(self):
            new_url = self.new_urls.pop()
            self.old_urls.add(new_url)
            return new_url
    

    ##HTML downloader

    import urllib
    import urllib.request
    
    class HtmlDownloader(object):
    
        def download(self, url):
            if url is None:
                return None
    
            # pretend to be a browser; a plain request may be rejected
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent':user_agent}
            # build the request
            req = urllib.request.Request(url,headers=headers)
            # fetch the page
            response = urllib.request.urlopen(req)
            # in Python 3 read() returns bytes, not str, so decode it with bytes.decode
            return response.read().decode()
    

    ##HTML parser

    import re
    import urllib
    from urllib.parse import urlparse
    
    from bs4 import BeautifulSoup
    
    class HtmlParser(object):
    
        def _get_new_urls(self, page_url, soup):
            new_urls = set()
            # links of the form /item/xxx
            links = soup.find_all('a', href=re.compile(r'/item/.*?'))
            for link in links:
                new_url = link['href']
                new_full_url = urllib.parse.urljoin(page_url, new_url)
                new_urls.add(new_full_url)
            return new_urls
    
        # extract the title and summary
        def _get_new_data(self, page_url, soup):
            # result dictionary
            res_data = {}
            #url
            res_data['url'] = page_url
            # the title lives in <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1>
            title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
            print(str(title_node.get_text()))
            res_data['title'] = str(title_node.get_text())
            #<div class="lemma-summary" label-module="lemmaSummary">
            summary_node = soup.find('div', class_="lemma-summary")
            res_data['summary'] = summary_node.get_text()
    
            return res_data
    
        def parse(self, page_url, html_content):
            if page_url is None or html_content is None:
                return None
    
            soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')
            new_urls = self._get_new_urls(page_url, soup)
            new_data = self._get_new_data(page_url, soup)
            return new_urls, new_data
    

    ##HTML outputer

    class HtmlOutputer(object):
    
        def __init__(self):
            self.datas = []
    
        def collect_data(self, data):
            if data is None:
                return
            self.datas.append(data )
    
        def output_html(self):
            fout = open('maya.html', 'w', encoding='utf-8')
            fout.write("<head><meta http-equiv='content-type' content='text/html;charset=utf-8'></head>")
            fout.write('<html>')
            fout.write('<body>')
            fout.write('<table border="1">')
            # <th width="5%">Url</th>
            fout.write('''<tr style="color:red" width="90%">
                        <th>Theme</th>
                        <th width="80%">Content</th>
                        </tr>''')
            for data in self.datas:
                fout.write('<tr>\n')
                # fout.write('\t<td>%s</td>' % data['url'])
                fout.write('\t<td align="center"><a href=\'%s\'>%s</td>' % (data['url'], data['title']))
                fout.write('\t<td>%s</td>\n' % data['summary'])
                fout.write('</tr>\n')
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')
            fout.close()
    

    ##Run results

    (result screenshot from the original post omitted)

    ##Appendix
      Complete code

    ![complete code](https://img-blog.csdnimg.cn/img_convert/f9c024e20306fb0e4e3e84a15aab3217.png)
  • This article shares a record of a simple Python crawler example for anyone who needs it. The main flow is: scrape, organize, store. A few packages are used, including requests, which sends requests to the website and fetches the page source... (a minimal sketch of this flow follows below)
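
    A minimal sketch of that scrape / organize / store flow (the URL and the selector below are placeholders, not taken from the original article):

    import requests
    from bs4 import BeautifulSoup
    import pandas

    # 1. scrape: request the page and get its source
    response = requests.get("http://example.com/news")  # placeholder URL
    response.encoding = response.apparent_encoding

    # 2. organize: parse the fields you need into a list of dicts
    soup = BeautifulSoup(response.text, "html.parser")
    rows = [{"title": a.get_text(strip=True), "link": a.get("href")}
            for a in soup.select("a")]

    # 3. store: save the structured data to a spreadsheet
    pandas.DataFrame(rows).to_excel("result.xlsx")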
  • WechatSogou [1] - a WeChat official-account crawler based on Sogou WeChat search; it can be extended into a crawler based on Sogou search in general. It returns a list in which each item is a dict with the details of one official account. DouBanSpider [2] - a Douban book crawler that can crawl Douban books by tag...
  • Python crawler example

    2019-10-01 15:42:38
    Below is a simple crawler example I wrote: 1. define a function that reads the HTML source of a web page 2. use regular expressions to pick the content you need out of the source 3. write each .htm page in the sequence to the D: drive #!/usr/bin/python import re import urllib.request #定义... (a sketch of these three steps follows below)
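
    A minimal sketch of those three steps (the start URL and the regular expression are placeholder assumptions; the original code is truncated above):

    #!/usr/bin/python
    import re
    import urllib.request
    from urllib.parse import urljoin

    # 1. define a function that reads the HTML source of a web page
    def get_html(url):
        with urllib.request.urlopen(url) as resp:
            return resp.read().decode("utf-8", errors="ignore")

    # 2. use a regular expression to pick the links you need out of the source
    def find_links(base_url, html):
        return [urljoin(base_url, href)
                for href in re.findall(r'href="([^"]+?\.html?)"', html)]

    # 3. write each page in the sequence to the D: drive
    if __name__ == "__main__":
        start = "http://example.com/"
        for i, link in enumerate(find_links(start, get_html(start))):
            with open(r"D:\page_%d.htm" % i, "w", encoding="utf-8") as f:
                f.write(get_html(link))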
  • python crawler example explained in detail

    2020-09-20 09:18:23
    This article walks through a python crawler example in detail, including the crawler architecture and the key modules that make up a crawler; it should be a useful reference for interested readers.
