9.异步爬虫_代码007(未授权)

本文介绍：异步爬虫可以理解为不只依靠单线程顺序执行的爬虫。我们下面做个例子：之前我们通过单线程爬取过梨视频，在保存视频的时候会慢一些，为了提升效率，我们使用异步爬虫爬取。

异步爬虫可以理解为不只依靠单线程顺序执行的爬虫

我们下面做个例子，之前我们通过单线程爬取过梨视频 https://blog.csdn.net/potato123232/article/details/135672504

在保存视频的时候会慢一些，为了提升效率，我们使用异步爬虫爬取

import requests
from lxml import etree
import random
import re
from multiprocessing.dummy import Pool

# Fetch the "popular" listing page and keep a local copy for inspection.
url = 'https://www.pearvideo.com/popular'
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url=url, headers=headers, timeout=10)
# Fail fast on an HTTP error instead of saving and parsing an error page.
response.raise_for_status()
# Decode the body with the encoding detected from its content.
response.encoding = response.apparent_encoding
with open('./test.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
print(response)

# Collect the detail-page URL and title of every entry in the popular list.
detail_htmls = []
tree = etree.HTML(response.text)
# Select the <li> nodes once and query each one with relative XPath, rather
# than re-running three whole-document queries for every index.
for li in tree.xpath('//*[@id="popularList"]/li'):
    hrefs = li.xpath('./a/@href')
    titles = li.xpath('./div[2]/a/h2/text()')
    if not hrefs or not titles:
        # An entry without a link or a title cannot be downloaded or named;
        # skip it instead of failing the whole run with an IndexError.
        continue
    detail_htmls.append({
        'url': 'https://www.pearvideo.com/' + hrefs[0],
        'title': titles[0],
    })

# Resolve the real download URL of every video found on the listing page.
#
# The pattern captures the first segment of the file name: the text between
# the last '/' and the first '-<digit>'.
# NOTE(review): assumes srcUrl file names look like '<segment>-<digits>...';
# confirm against a live videoStatus.jsp response.
p = re.compile(r'.*/(.*?)-\d')

video_detail_list = []
for detail in detail_htmls:
    contId = detail['url'].split('_')[-1]
    mrd = round(random.random(), 16)
    # Build the headers per request so the shared `headers` dict is not
    # mutated; Referer must point at the detail page of this video.
    request_headers = dict(headers)
    request_headers['Host'] = 'www.pearvideo.com'
    request_headers['Referer'] = detail['url']
    status_response = requests.get(
        url='https://www.pearvideo.com/videoStatus.jsp?contId=' + str(contId) + '&mrd=' + str(mrd),
        headers=request_headers,
        timeout=10,
    )

    # SECURITY: the body comes from the network, so it is parsed as JSON
    # rather than executed with eval().
    try:
        payload = status_response.json()
    except ValueError:
        print('skipped ' + detail['url'] + ': status response is not valid JSON')
        continue

    # Any level may be missing when the server reports an error for this id.
    videos = (payload.get('videoInfo') or {}).get('videos') or {}
    srcUrl = videos.get('srcUrl')
    found = p.search(srcUrl) if srcUrl else None
    if found is None:
        print('skipped ' + detail['url'] + ': no usable srcUrl in status response')
        continue

    # Swap the captured segment for 'cont-<contId>' to get the playable URL.
    true_video_url = srcUrl[:found.start(1)] + 'cont-' + contId + srcUrl[found.end(1):]

    # Strip characters that are not allowed in file names. The backslash is
    # doubled so the class contains both '\' and '/'.
    video_name = re.sub(r'[\\/:*?"<>|]', '', detail['title'])
    video_detail_list.append({"name": video_name, "url": true_video_url})

print(video_detail_list)
def get_video(item):
    """Download one video and save it as ``./result/<name>.mp4``.

    Args:
        item: Mapping with a ``'url'`` key (direct video URL) and a
            ``'name'`` key (file name without extension).

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved as a video and reported as success.
    """
    import os

    # The target directory may not exist on a first run.
    os.makedirs('./result', exist_ok=True)
    # Stream the body in chunks instead of holding the whole video in memory;
    # the timeout prevents a stalled connection from blocking a worker forever.
    with requests.get(item['url'], stream=True, timeout=30) as response:
        response.raise_for_status()
        with open('./result/' + str(item['name']) + '.mp4', 'wb') as fp:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                fp.write(chunk)
    print(item['url'] + '下载成功')

# Download with four worker threads (multiprocessing.dummy.Pool is a thread
# pool). The context manager releases the workers once map() has returned.
with Pool(4) as pool:
    pool.map(get_video, video_detail_list)

import requests
from lxml import etree
import random
import re
import aiohttp
import asyncio
import aiofiles

# Fetch the "popular" listing page and keep a local copy for inspection.
url = 'https://www.pearvideo.com/popular'
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url=url, headers=headers, timeout=10)
# Fail fast on an HTTP error instead of saving and parsing an error page.
response.raise_for_status()
# Decode the body with the encoding detected from its content.
response.encoding = response.apparent_encoding
with open('./test.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
print(response)

# Collect the detail-page URL and title of every entry in the popular list.
detail_htmls = []
tree = etree.HTML(response.text)
# Select the <li> nodes once and query each one with relative XPath, rather
# than re-running three whole-document queries for every index.
for li in tree.xpath('//*[@id="popularList"]/li'):
    hrefs = li.xpath('./a/@href')
    titles = li.xpath('./div[2]/a/h2/text()')
    if not hrefs or not titles:
        # An entry without a link or a title cannot be downloaded or named;
        # skip it instead of failing the whole run with an IndexError.
        continue
    detail_htmls.append({
        'url': 'https://www.pearvideo.com/' + hrefs[0],
        'title': titles[0],
    })

# Resolve the real download URL of every video found on the listing page.
#
# The pattern captures the first segment of the file name: the text between
# the last '/' and the first '-<digit>'.
# NOTE(review): assumes srcUrl file names look like '<segment>-<digits>...';
# confirm against a live videoStatus.jsp response.
p = re.compile(r'.*/(.*?)-\d')

video_detail_list = []
for detail in detail_htmls:
    contId = detail['url'].split('_')[-1]
    mrd = round(random.random(), 16)
    # Build the headers per request so the shared `headers` dict is not
    # mutated; Referer must point at the detail page of this video.
    request_headers = dict(headers)
    request_headers['Host'] = 'www.pearvideo.com'
    request_headers['Referer'] = detail['url']
    status_response = requests.get(
        url='https://www.pearvideo.com/videoStatus.jsp?contId=' + str(contId) + '&mrd=' + str(mrd),
        headers=request_headers,
        timeout=10,
    )

    # SECURITY: the body comes from the network, so it is parsed as JSON
    # rather than executed with eval().
    try:
        payload = status_response.json()
    except ValueError:
        print('skipped ' + detail['url'] + ': status response is not valid JSON')
        continue

    # Any level may be missing when the server reports an error for this id.
    videos = (payload.get('videoInfo') or {}).get('videos') or {}
    srcUrl = videos.get('srcUrl')
    found = p.search(srcUrl) if srcUrl else None
    if found is None:
        print('skipped ' + detail['url'] + ': no usable srcUrl in status response')
        continue

    # Swap the captured segment for 'cont-<contId>' to get the playable URL.
    true_video_url = srcUrl[:found.start(1)] + 'cont-' + contId + srcUrl[found.end(1):]

    # Strip characters that are not allowed in file names. The backslash is
    # doubled so the class contains both '\' and '/'.
    video_name = re.sub(r'[\\/:*?"<>|]', '', detail['title'])
    video_detail_list.append({"name": video_name, "url": true_video_url})

print(video_detail_list)

async def test(item):
    """Download one video asynchronously and save it as ``./result/<name>.mp4``.

    Args:
        item: Mapping with a ``'url'`` key (direct video URL) and a
            ``'name'`` key (file name without extension).

    Raises:
        aiohttp.ClientResponseError: If the server answers with an error
            status, so an error page is never saved as a video.
    """
    import os

    # The target directory may not exist on a first run.
    os.makedirs('./result', exist_ok=True)
    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager, so it is entered
        # directly without an extra await.
        async with session.get(item['url']) as response:
            response.raise_for_status()
            async with aiofiles.open('./result/' + str(item['name']) + '.mp4', 'wb') as fp:
                # Write chunk by chunk so concurrent downloads do not each
                # hold a whole video in memory.
                async for chunk in response.content.iter_chunked(64 * 1024):
                    await fp.write(chunk)
    print(item['url'] + '下载成功')

async def download_all(items):
    """Run every download concurrently and report the ones that failed.

    Args:
        items: Sequence of mappings accepted by ``test`` (``'url'``/``'name'``).

    Returns:
        One entry per item, in order: ``None`` for a finished download or the
        exception that download raised.
    """
    # return_exceptions=True lets the remaining downloads finish when one
    # fails; gather() also accepts an empty argument list.
    results = await asyncio.gather(
        *(test(item) for item in items),
        return_exceptions=True,
    )
    for item, result in zip(items, results):
        if isinstance(result, BaseException):
            print(item['url'] + ' failed: ' + repr(result))
    return results


# asyncio.run() creates the event loop, runs the coroutine and closes the
# loop; the tasks are created while the loop is already running.
asyncio.run(download_all(video_detail_list))