This is an asynchronous crawler; here is the code:

    import asyncio

    import aiohttp

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

    async def fetch(url, semaphore):
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(url, headers=headers, timeout=10) as response:
                        # aiohttp takes a single proxy URL rather than a requests-style dict, e.g.:
                        # async with session.get(url, headers=headers, timeout=10,
                        #                        proxy="http://10.10.1.10:3128") as response:
                        return await response.text(), url
                except Exception:
                    # on any failure return a stub page so gather() still yields a result
                    return """<html><head><title>Error</title></head></html>""", url

    async def main():
        urls = []  # fill in the target URLs
        semaphore = asyncio.Semaphore(500)  # default concurrency of 500; the maximum is 512 on Windows and 1024 on Linux
        tasks = [fetch(url, semaphore) for url in urls]
        responses = await asyncio.gather(*tasks)
        for response in responses:
            print(response[0], response[1])  # process each returned page and its URL here

    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
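As a side note, the aiohttp documentation recommends reusing a single ClientSession across many requests instead of opening one per URL; a minimal sketch of that variant (same headers as above, fetch now takes the session as a parameter):

    async def fetch(session, url, semaphore):
        async with semaphore:
            try:
                async with session.get(url, headers=headers, timeout=10) as response:
                    return await response.text(), url
            except Exception:
                return """<html><head><title>Error</title></head></html>""", url

    async def main():
        urls = []  # fill in the target URLs
        semaphore = asyncio.Semaphore(500)
        async with aiohttp.ClientSession() as session:  # one session, one shared connection pool
            tasks = [fetch(session, url, semaphore) for url in urls]
            responses = await asyncio.gather(*tasks)
            for text, url in responses:
                print(text, url)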

Putting this code inside a threading.Thread raises an error, typically RuntimeError: There is no current event loop in thread 'Thread-1'. The reference below suggests installing a new event loop, optionally switching the loop policy on Windows first:

    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())  # optional, Windows only: use the proactor loop
    loop = asyncio.new_event_loop()  # create a fresh event loop
    asyncio.set_event_loop(loop)  # install it as this thread's current loop

https://www.codenong.com/48725890/

The fix

Replace the original invocation

    loop = asyncio.get_event_loop()  # fetch the current thread's event loop
    loop.run_until_complete(main())  # run the crawler

with the following:

    loop = asyncio.new_event_loop()  # create a fresh event loop
    asyncio.set_event_loop(loop)  # install it as this thread's current loop
    loop.run_until_complete(main())  # run the crawler
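If this pattern recurs across several threads, it can live in a small helper; run_async below is a hypothetical name, not part of asyncio:

    import asyncio

    def run_async(coro):
        """Run a coroutine on a fresh event loop owned by the calling thread."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(coro)
        finally:
            loop.close()  # free the loop's resources once the coroutine finishes

    # usage from inside any thread:
    # results = run_async(main())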

The likely cause: asyncio.get_event_loop() creates an event loop automatically only on the main thread, so calling it from inside a threading.Thread finds no current loop and fails; the thread has to create and install its own loop.
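A minimal sketch of the failure, with a hypothetical worker function:

    import asyncio
    import threading

    def worker():
        asyncio.get_event_loop()  # raises RuntimeError: no current event loop in this thread

    t = threading.Thread(target=worker)
    t.start()
    t.join()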

The calling code:

    t1 = Reptile_Thread()
    t1.start()
    print("running")

and the thread class, fixed so that the headers, the coroutines, and the event-loop setup all live inside run():

    import asyncio
    import threading

    import aiohttp

    class Reptile_Thread(threading.Thread):
        """Website crawling thread."""

        def __init__(self):
            super().__init__()  # threading.Thread takes no parent argument

        def run(self):
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

            async def fetch(url, semaphore):
                async with semaphore:
                    async with aiohttp.ClientSession() as session:
                        try:
                            async with session.get(url, headers=headers, timeout=10) as response:
                                return await response.text(), url
                        except Exception:
                            return """<html><head><title>Error</title></head></html>""", url

            async def main():
                urls = []  # fill in the target URLs
                semaphore = asyncio.Semaphore(500)  # default concurrency of 500; the maximum is 512 on Windows and 1024 on Linux
                tasks = [fetch(url, semaphore) for url in urls]
                responses = await asyncio.gather(*tasks)
                for response in responses:
                    print(response[0], response[1])  # process each returned page and its URL here

            # the fix from above: give this thread its own event loop
            loop = asyncio.new_event_loop()  # create a fresh event loop
            asyncio.set_event_loop(loop)  # install it as this thread's current loop
            loop.run_until_complete(main())  # run the crawler
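On Python 3.7 and later, asyncio.run() performs the same create-and-install steps and also closes the loop once the coroutine finishes, so the last three lines of run() can be collapsed to:

    asyncio.run(main())  # creates a fresh loop, installs it, runs main(), then closes the loop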

Original article: https://blog.csdn.net/zx520113/article/details/134793745
