Python crawler efficiency comparison of multiprocessing, multithreading, coroutines, and their combinations (multiprocessing library), using a full single-novel scrape as the example

This post benchmarks scraping a single novel with multiprocessing, multithreading, coroutines, and the combinations multiprocessing + multithreading and multiprocessing + coroutines.

The test case is the novel 大道争锋 from the site 笔趣阁 (biquge), scraped with each of the combinations above.
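
Before the individual tests, here is a minimal, self-contained sketch of the two main invocation patterns being compared: a single thread pool, and a process pool where each worker runs its own thread pool. This is an illustration only; the stub fetch() stands in for the real chapter download, and all names here are hypothetical (the gevent pattern appears in its own section below):

import time
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool

def fetch(i):                      # stub: one blocking "HTTP request"
    time.sleep(0.01)
    return i

urls = list(range(100))

def threads_only():                # pattern 1: one thread pool over all urls
    with ThreadPoolExecutor(50) as ex:
        return list(ex.map(fetch, urls))

def proc_worker(batch):            # pattern 2: each process runs its own thread pool
    with ThreadPoolExecutor(50) as ex:
        return list(ex.map(fetch, batch))

if __name__ == '__main__':
    t0 = time.time()
    threads_only()
    print('threads only:', round(time.time() - t0, 2), 's')
    batches = [urls[i:i+25] for i in range(0, len(urls), 25)]
    t0 = time.time()
    with Pool(4) as p:             # 4 worker processes, each running proc_worker
        p.map(proc_worker, batches)
    print('processes + threads:', round(time.time() - t0, 2), 's')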

  • Multithreading

The code is as follows:

# -*- coding: utf-8 -*-
"""
Created on Wed Mar  4 10:39:55 2020

@author: wenzhe.tian

Multithreading version
"""

book_name_list=['大道争锋']

####### Start of the job
import time
from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
import os
import urllib.parse as parse

save_path='D:\\bqg_novel\\'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
target_url='https://m.52bqg.com'
os.makedirs(save_path, exist_ok=True)

### Two helper functions: fetch a chapter's content and the chapter links
def get_chapter_content(i): # return the full text of one chapter, given its relative url
    chapter_now=requests.get(target_url+i,headers=headers)
    chapter_now.encoding='gbk'
    chapter_now=etree.HTML(chapter_now.text)
    chapter_content='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
    next_page_num=1
    # a long chapter is split across pages; keep following the "下一页" (next page) link
    while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
        chapter_content=chapter_content.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
        next_page_num=next_page_num+1
        chapter_now=requests.get(target_url+i.replace('.html','_'+str(next_page_num)+'.html'),headers=headers)
        chapter_now.encoding='gbk'
        chapter_now=etree.HTML(chapter_now.text)
        chapter_content_next='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        chapter_content_next=chapter_content_next.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
        chapter_content=chapter_content+chapter_content_next
    return chapter_content

def get_chapter_link(i): ########## fetch the chapter names and links from listing page i ######
    global url_all,headers
    if i==0:
        req_next=requests.get(url_all,headers=headers)
    else:
        req_next=requests.get(url_all+'/'+str(i+1),headers=headers)
    req_next.encoding='gbk'
    html_next=etree.HTML(req_next.text)
    chapter_name_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
    chapter_url_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
    chapter_name=chapter_name_next[1:]   # skip the first entry, which is not a chapter
    chapter_url=chapter_url_next[1:]
    return chapter_name,chapter_url

################################# Scrape the content of every chapter #####################################
novel=[]

for k in book_name_list:
    start=time.time()
    url='https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey='+parse.quote(k,encoding="gbk")+'&t_btnsearch='
    req=requests.get(url,headers=headers)
    req.encoding='gbk'
    if 'book_' in req.url and 'search' not in req.url: # redirected straight to the book page: scrape it
        url_all=req.url.replace('book','chapters')
    else: # got a result list: prefer an exact title match, otherwise take the top hit
        html_search=etree.HTML(req.text)
        search_book=html_search.xpath('//div[@class="article"]/a/text()')
        search_book_url=html_search.xpath('//div[@class="article"]/a[1]/@href')
        if k in search_book:
            url_all=(target_url+search_book_url[search_book.index(k)]).replace('book','chapters')
        else:
            url_all=(target_url+search_book_url[0]).replace('book','chapters')

    # find how many listing pages of chapters the book has
    req_all=requests.get(url_all,headers=headers)
    req_all.encoding='gbk'
    html_all=etree.HTML(req_all.text)
    chapter_page_all=html_all.xpath('//table[@class="page-book"]//td/a/@href')
    chapter_page_all=int(chapter_page_all[1].split('/')[-1])
    # start the multithreaded scrape
    with ThreadPoolExecutor(250) as executor:
        # collect the chapter links from every listing page
        chapter=list(executor.map(get_chapter_link,range(chapter_page_all)))
        chapter=list(zip(*chapter))
        chapter_name=sum(list(chapter[0]),[])   # flatten the per-page lists
        chapter_url=sum(list(chapter[1]),[])
        # fetch every chapter body; executor.map keeps the input order
        chapter_all=list(executor.map(get_chapter_content,chapter_url))
    end=time.time()
    print("Elapsed: "+str(int(end-start))+' s') # timing
    for i in range(len(chapter_all)):
        chapter_all[i]=chapter_name[i]+'\n'+chapter_all[i]
    target='\n'.join(chapter_all)
    with open(save_path+k+'.txt','a+',encoding='utf-8') as f:
        f.write(target)
    print(k+' finished')


Elapsed: 70 s
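
A detail the code above quietly relies on: executor.map returns results in the order of its inputs, even when tasks finish out of order, which is what keeps chapter_name[i] paired with the right chapter body. A tiny demonstration:

from concurrent.futures import ThreadPoolExecutor
import time

def work(x):
    time.sleep(0.1 if x == 0 else 0)   # the first task finishes last
    return x

with ThreadPoolExecutor(4) as ex:
    print(list(ex.map(work, [0, 1, 2])))   # prints [0, 1, 2], not [1, 2, 0]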

  • Coroutines (gevent)

The code is as follows:

# -*- coding: utf-8 -*-
"""
Created on Wed Mar  4 10:39:55 2020

@author: wenzhe.tian

Coroutine (gevent) version
"""

book_name_list=['大道争锋']

####### Start of the job

import gevent
from gevent import monkey,pool
pool=pool.Pool(200)   # NB: never actually used below; gevent.spawn bypasses it (see the note after the timing result)
monkey.patch_all(thread=False)   # patch blocking I/O so requests yields between greenlets
import requests
import time
from lxml import etree
import os
import urllib.parse as parse

save_path='D:\\bqg_novel\\'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
target_url='https://m.52bqg.com'
os.makedirs(save_path, exist_ok=True)

### Two helper functions: fetch a chapter's content and the chapter links
def get_chapter_content(i): # return the full text of one chapter, given its relative url
    chapter_now=requests.get(target_url+i,headers=headers)
    chapter_now.encoding='gbk'
    chapter_now=etree.HTML(chapter_now.text)
    chapter_content='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
    next_page_num=1
    # a long chapter is split across pages; keep following the "下一页" (next page) link
    while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
        chapter_content=chapter_content.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
        next_page_num=next_page_num+1
        chapter_now=requests.get(target_url+i.replace('.html','_'+str(next_page_num)+'.html'),headers=headers)
        chapter_now.encoding='gbk'
        chapter_now=etree.HTML(chapter_now.text)
        chapter_content_next='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        chapter_content_next=chapter_content_next.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
        chapter_content=chapter_content+chapter_content_next
    return chapter_content

def get_chapter_link(i): ########## fetch the chapter names and links from listing page i ######
    global url_all,headers
    if i==0:
        req_next=requests.get(url_all,headers=headers)
    else:
        req_next=requests.get(url_all+'/'+str(i+1),headers=headers)
    req_next.encoding='gbk'
    html_next=etree.HTML(req_next.text)
    chapter_name_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
    chapter_url_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
    chapter_name=chapter_name_next[1:]   # skip the first entry, which is not a chapter
    chapter_url=chapter_url_next[1:]
    return chapter_name,chapter_url

################################# Scrape the content of every chapter #####################################
novel=[]

for k in book_name_list:
    start=time.time()
    url='https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey='+parse.quote(k,encoding="gbk")+'&t_btnsearch='
    req=requests.get(url,headers=headers)
    req.encoding='gbk'
    if 'book_' in req.url and 'search' not in req.url: # redirected straight to the book page: scrape it
        url_all=req.url.replace('book','chapters')
    else: # got a result list: prefer an exact title match, otherwise take the top hit
        html_search=etree.HTML(req.text)
        search_book=html_search.xpath('//div[@class="article"]/a/text()')
        search_book_url=html_search.xpath('//div[@class="article"]/a[1]/@href')
        if k in search_book:
            url_all=(target_url+search_book_url[search_book.index(k)]).replace('book','chapters')
        else:
            url_all=(target_url+search_book_url[0]).replace('book','chapters')

    # find how many listing pages of chapters the book has
    req_all=requests.get(url_all,headers=headers)
    req_all.encoding='gbk'
    html_all=etree.HTML(req_all.text)
    chapter_page_all=html_all.xpath('//table[@class="page-book"]//td/a/@href')
    chapter_page_all=int(chapter_page_all[1].split('/')[-1])
    # start the coroutine scrape: one greenlet per listing page, then one per chapter
    g_list=[gevent.spawn(get_chapter_link,x) for x in range(chapter_page_all)]
    gevent.joinall(g_list)
    chapter=[g.value for g in g_list]
    chapter=list(zip(*chapter))
    chapter_name=sum(list(chapter[0]),[])   # flatten the per-page lists
    chapter_url=sum(list(chapter[1]),[])
    g_list=[gevent.spawn(get_chapter_content,x) for x in chapter_url]
    gevent.joinall(g_list)
    chapter_all=[g.value for g in g_list]
    end=time.time()
    print("Elapsed: "+str(int(end-start))+' s') # timing
    for i in range(len(chapter_all)):
        chapter_all[i]=chapter_name[i]+'\n'+chapter_all[i]
    target='\n'.join(chapter_all)
    with open(save_path+k+'.txt','a+',encoding='utf-8') as f:
        f.write(target)
    print(k+' finished')


Elapsed: 103 s
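
One detail worth flagging: the script above builds pool.Pool(200) but then launches greenlets with gevent.spawn, which bypasses the pool, so every chapter request is spawned at once rather than capped at 200 concurrent requests. A minimal sketch of how the cap would actually apply, with a stub fetch standing in for get_chapter_content (names here are illustrative):

import gevent
from gevent import pool

def fetch(u):                 # stub for get_chapter_content
    gevent.sleep(0.01)        # stands in for the network wait
    return u

p = pool.Pool(200)            # at most 200 greenlets run concurrently
g_list = [p.spawn(fetch, u) for u in range(1000)]   # p.spawn blocks once the pool is full
gevent.joinall(g_list)
results = [g.value for g in g_list]
print(len(results))           # 1000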

  • Multiprocessing + multithreading

The code is as follows:

# -*- coding: utf-8 -*-
"""
Created on Wed Mar  4 10:39:55 2020

@author: wenzhe.tian

Multiprocessing + multithreading version
"""

####### Start of the job
import time
from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
import os
import urllib.parse as parse
from multiprocessing import Pool

book_name_list=['斗罗大陆3龙王传说']
save_path='D:\\bqg_novel\\'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
target_url='https://m.52bqg.com'
os.makedirs(save_path, exist_ok=True)
novel=[]

### Two helper functions: fetch a chapter's content and the chapter links
def get_chapter_content(i): # return the full text of one chapter, given its relative url
    chapter_now=requests.get(target_url+i,headers=headers)
    chapter_now.encoding='gbk'
    chapter_now=etree.HTML(chapter_now.text)
    chapter_content='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
    next_page_num=1
    # a long chapter is split across pages; keep following the "下一页" (next page) link
    while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
        chapter_content=chapter_content.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
        next_page_num=next_page_num+1
        chapter_now=requests.get(target_url+i.replace('.html','_'+str(next_page_num)+'.html'),headers=headers)
        chapter_now.encoding='gbk'
        chapter_now=etree.HTML(chapter_now.text)
        chapter_content_next='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        chapter_content_next=chapter_content_next.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
        chapter_content=chapter_content+chapter_content_next
    return chapter_content

def get_chapter_link(link): ########## fetch the chapter names and links from one listing page ######
    # link is a pair [page index, listing url]: globals are not shared across processes
    i=link[0]
    url_all=link[1]
    if i==0:
        req_next=requests.get(url_all,headers=headers)
    else:
        req_next=requests.get(url_all+str(i+1),headers=headers)
    req_next.encoding='gbk'
    html_next=etree.HTML(req_next.text)
    chapter_name_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
    chapter_url_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
    chapter_name=chapter_name_next[1:]   # skip the first entry, which is not a chapter
    chapter_url=chapter_url_next[1:]
    return chapter_name,chapter_url

def run_proc(page): # one worker process: its own thread pool over a batch of listing pages
    with ThreadPoolExecutor(200) as executor:
        i=list(page[0])            # the listing-page indices in this batch
        for k in range(len(i)):
            i[k]=[i[k],page[1]]    # attach the listing url to each index
        chapter=list(executor.map(get_chapter_link,i))
        chapter=list(zip(*chapter))
        chapter_name=sum(list(chapter[0]),[])   # flatten the per-page lists
        chapter_url=sum(list(chapter[1]),[])
        chapter_all=list(executor.map(get_chapter_content,chapter_url))
        for i in range(len(chapter_all)):
            chapter_all[i]=chapter_name[i]+'\n'+chapter_all[i]
        return chapter_all

################################# Scrape the content of every chapter #####################################
if __name__ == '__main__':
    for k in book_name_list:
        start=time.time()
        url='https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey='+parse.quote(k,encoding="gbk")+'&t_btnsearch='
        req=requests.get(url,headers=headers)
        req.encoding='gbk'
        if 'book_' in req.url and 'search' not in req.url: # redirected straight to the book page: scrape it
            url_all=req.url.replace('book','chapters')
        else: # got a result list: prefer an exact title match, otherwise take the top hit
            html_search=etree.HTML(req.text)
            search_book=html_search.xpath('//div[@class="article"]/a/text()')
            search_book_url=html_search.xpath('//div[@class="article"]/a[1]/@href')
            if k in search_book:
                url_all=(target_url+search_book_url[search_book.index(k)]).replace('book','chapters')
            else:
                url_all=(target_url+search_book_url[0]).replace('book','chapters')

        # find how many listing pages of chapters the book has
        req_all=requests.get(url_all,headers=headers)
        req_all.encoding='gbk'
        html_all=etree.HTML(req_all.text)
        chapter_page_all=html_all.xpath('//table[@class="page-book"]//td/a/@href')
        chapter_page_all=int(chapter_page_all[1].split('/')[-1])

        # split the listing pages into batches of 10, one batch per pool task
        count=0
        page_list=[]
        while count<chapter_page_all:
            next_count=count+10
            if next_count>chapter_page_all:
                next_count=chapter_page_all
            page_list.append([range(count,next_count),url_all])
            count=count+10

        p = Pool(4)
        result=p.map(run_proc, page_list)
        p.close()
        p.join()
        chapter_all=sum(result,[])
        end=time.time()
        print("Elapsed: "+str(int(end-start))+' s') # timing

        target='\n'.join(chapter_all)
        with open(save_path+k+'.txt','a+',encoding='utf-8') as f:
            f.write(target)
        print(k+' finished')

Elapsed: 40 s (note: this run scraped 斗罗大陆3龙王传说 rather than 大道争锋, so the timing is not directly comparable to the runs above)
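
Unlike the thread-only and gevent versions, this script must wrap its top-level work in an if __name__ == '__main__': guard: on Windows, multiprocessing starts each worker by re-importing the module, and without the guard every worker would try to create its own Pool and recurse. A minimal illustration, with a stub square task that is not part of the crawler:

from multiprocessing import Pool

def square(x):
    return x * x

if __name__ == '__main__':     # required on Windows: workers re-import this module,
    with Pool(4) as p:         # and only the parent may create the Pool
        print(p.map(square, range(8)))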

  • Multiprocessing + coroutines

The code is as follows:

# -*- coding: utf-8 -*-
"""
Created on Wed Mar  4 10:39:55 2020

@author: wenzhe.tian

Multiprocessing + coroutines version
"""

####### Start of the job
import gevent
from gevent import monkey
monkey.patch_all(thread=False)   # patch blocking I/O so requests yields between greenlets
import requests
import time
from lxml import etree
import os
import urllib.parse as parse
from multiprocessing import Pool

book_name_list=['大道争锋']
save_path='D:\\bqg_novel\\'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
target_url='https://m.52bqg.com'
os.makedirs(save_path, exist_ok=True)
novel=[]

### Two helper functions: fetch a chapter's content and the chapter links
def get_chapter_content(i): # return the full text of one chapter, given its relative url
    chapter_now=requests.get(target_url+i,headers=headers)
    chapter_now.encoding='gbk'
    chapter_now=etree.HTML(chapter_now.text)
    chapter_content='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
    next_page_num=1
    # a long chapter is split across pages; keep following the "下一页" (next page) link
    while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
        chapter_content=chapter_content.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
        next_page_num=next_page_num+1
        chapter_now=requests.get(target_url+i.replace('.html','_'+str(next_page_num)+'.html'),headers=headers)
        chapter_now.encoding='gbk'
        chapter_now=etree.HTML(chapter_now.text)
        chapter_content_next='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        chapter_content_next=chapter_content_next.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
        chapter_content=chapter_content+chapter_content_next
    return chapter_content

def get_chapter_link(link): ########## fetch the chapter names and links from one listing page ######
    # link is a pair [page index, listing url]: globals are not shared across processes
    i=link[0]
    url_all=link[1]
    if i==0:
        req_next=requests.get(url_all,headers=headers)
    else:
        req_next=requests.get(url_all+str(i+1),headers=headers)
    req_next.encoding='gbk'
    html_next=etree.HTML(req_next.text)
    chapter_name_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
    chapter_url_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
    chapter_name=chapter_name_next[1:]   # skip the first entry, which is not a chapter
    chapter_url=chapter_url_next[1:]
    return chapter_name,chapter_url

def run_proc(page): # one worker process: one greenlet per listing page, then one per chapter
    i=list(page[0])            # the listing-page indices in this batch
    for k in range(len(i)):
        i[k]=[i[k],page[1]]    # attach the listing url to each index
    g_list=[gevent.spawn(get_chapter_link,x) for x in i]
    gevent.joinall(g_list)
    chapter=[g.value for g in g_list]
    chapter=list(zip(*chapter))
    chapter_name=sum(list(chapter[0]),[])   # flatten the per-page lists
    chapter_url=sum(list(chapter[1]),[])
    g_list=[gevent.spawn(get_chapter_content,x) for x in chapter_url]
    gevent.joinall(g_list)
    chapter_all=[g.value for g in g_list]
    for i in range(len(chapter_all)):
        chapter_all[i]=chapter_name[i]+'\n'+chapter_all[i]
    return chapter_all

################################# Scrape the content of every chapter #####################################
if __name__ == '__main__':
    for k in book_name_list:
        start=time.time()
        url='https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey='+parse.quote(k,encoding="gbk")+'&t_btnsearch='
        req=requests.get(url,headers=headers)
        req.encoding='gbk'
        if 'book_' in req.url and 'search' not in req.url: # redirected straight to the book page: scrape it
            url_all=req.url.replace('book','chapters')
        else: # got a result list: prefer an exact title match, otherwise take the top hit
            html_search=etree.HTML(req.text)
            search_book=html_search.xpath('//div[@class="article"]/a/text()')
            search_book_url=html_search.xpath('//div[@class="article"]/a[1]/@href')
            if k in search_book:
                url_all=(target_url+search_book_url[search_book.index(k)]).replace('book','chapters')
            else:
                url_all=(target_url+search_book_url[0]).replace('book','chapters')

        # find how many listing pages of chapters the book has
        req_all=requests.get(url_all,headers=headers)
        req_all.encoding='gbk'
        html_all=etree.HTML(req_all.text)
        chapter_page_all=html_all.xpath('//table[@class="page-book"]//td/a/@href')
        chapter_page_all=int(chapter_page_all[1].split('/')[-1])

        # split the listing pages into batches of 10, one batch per pool task
        count=0
        page_list=[]
        while count<chapter_page_all:
            next_count=count+10
            if next_count>chapter_page_all:
                next_count=chapter_page_all
            page_list.append([range(count,next_count),url_all])
            count=count+10

        p = Pool(4)
        result=p.map(run_proc, page_list)
        p.close()
        p.join()
        chapter_all=sum(result,[])
        end=time.time()
        print("Elapsed: "+str(int(end-start))+' s') # timing

        target='\n'.join(chapter_all)
        with open(save_path+k+'.txt','a+',encoding='utf-8') as f:
            f.write(target)
        print(k+' finished')

Elapsed: 60 s

The results make two things clear: using multiple cores beats a single core, and multithreading clearly beats no threading at all. Do not let the GIL mislead you here: for I/O-bound work such as crawling, a thread releases the GIL while it waits on the network, so the speedup is real.
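
If the GIL point seems surprising, a minimal experiment makes it concrete. The sketch below uses time.sleep as a stand-in for network latency (an assumption, not the real crawler); ten 0.2-second waits overlap almost completely under threads:

import time
from concurrent.futures import ThreadPoolExecutor

def io_task(_):
    time.sleep(0.2)            # the GIL is released during the wait

t0 = time.time()
for i in range(10):
    io_task(i)                 # sequential: about 2.0 s
print('sequential:', round(time.time() - t0, 2), 's')

t0 = time.time()
with ThreadPoolExecutor(10) as ex:
    list(ex.map(io_task, range(10)))   # threaded: about 0.2 s
print('threaded:', round(time.time() - t0, 2), 's')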

In this test, multithreading (with 250 threads) was more efficient than coroutines.

The next post will cover multiprocess and multithread orchestration for crawling an entire novel site, together with a comparison against Scrapy.

Feedback and corrections are welcome. If you have any questions, drop them in the comments.

Original: https://www.cnblogs.com/techs-wenzhe/p/12550451.html
Author: 冻雨冷雾
Title: Python crawler efficiency comparison of multiprocessing, multithreading, coroutines, and their combinations (multiprocessing library), using a full single-novel scrape as the example

