Python crawler: scraping a web novel (逆天邪神)

2022-03-06 23:05:11

Disclaimer: this is for my own entertainment, a summary of my own learning process.

Environment:

Project goal:

Final result: everything works. The script can tell whether the novel has been updated; if it has, it downloads the new chapters; and by winding back the recorded last-read chapter number (the bookmark of where you stopped last time), you can make it save the entire novel in one go.

Implementation walkthrough:

I only wrote a single main.py; one entry point plus two helper functions handles everything.

import requests
import re
from bs4 import BeautifulSoup
import os

if __name__ == '__main__':
    # novel_url is read as a module-level global by download_novel() below
    novel_url = "https://www.bige3.com/book/1030/"
    return_value = is_update(novel_url)
    if return_value == 0:
        print("小说尚未更新!")
    else:
        print("小说已更新" + str(return_value) + "章!")
        print("正在下载已更新的小说......")
        download_novel(return_value)
def is_update(url):
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
    }
    try:
        resp = requests.get(url, headers=headers)
        resp.raise_for_status()
        resp.encoding = 'utf-8'
    except requests.RequestException:
        print("爬取失败")
        return 0  # treat a failed request as "no update"

    # NOTE: the HTML tags inside the original regex were lost when this post was
    # extracted; the pattern below is a plausible reconstruction that captures
    # each chapter title from the book's chapter list.
    resp = re.findall(r'<dd><a href ="/book/1030/\d+\.html">(.*?)</a></dd>', resp.text)
    with open("小说更新记录.txt", "r", encoding='utf-8') as f:
        data = f.read()
    if data == str(resp[-1]):
        return 0  # recorded title still matches the newest chapter: no update
    else:
        # keep only the digits of each title so the chapter numbers can be compared
        data_num = re.findall(r'\d+', data)
        data_num = ''.join(data_num)
        resp_num = re.findall(r'\d+', resp[-1])
        resp_num = ''.join(resp_num)
        gap_num = int(resp_num) - int(data_num)
        # remember the newest chapter title for the next run
        with open("小说更新记录.txt", "w", encoding='utf-8') as f:
            f.write(str(resp[-1]))
            print("writing is ok!")
        return gap_num
def download_novel(return_value):
    if return_value >= 1:
        os.makedirs("./novel_downloads", exist_ok=True)  # make sure the output folder exists
        for i in range(1, return_value + 1):
            print(i)
            # the record file now holds the newest chapter title (is_update wrote it)
            with open("小说更新记录.txt", "r", encoding='utf-8') as f:
                data = f.read()
                data_num = re.findall(r'\d+', data)
                data_num = ''.join(data_num)
                # walk backwards from the newest chapter; on this site the page id
                # appears to be offset by one from the chapter number
                download_num = int(data_num) + 1 - (i - 1)
                print(novel_url + str(download_num) + '.html')
            resp = requests.get(novel_url + str(download_num) + '.html')
            resp.encoding = 'utf-8'  # avoid mojibake in the chapter text
            soup = BeautifulSoup(resp.text, 'lxml')
            soup.select('#chaptercontent')  # the chapter body lives in this tag
            # slice the page text between the '下一章' link and the '『点此报错' footer
            mytxt = soup.text[soup.text.find('下一章'):soup.text.rfind('『点此报错')]
            mytxt = mytxt[3:]  # drop the literal '下一章' itself
            mytxt = mytxt.strip()
            mytxt = mytxt.replace('  ', '\n')  # turn the blank separators into line breaks
            novel_save_location = "./novel_downloads/逆天邪神第" + str(download_num - 1) + "章.txt"
            with open(novel_save_location, "w", encoding='utf-8') as f:
                f.write(mytxt)
            print("下载完毕!")
    else:
        print("invalid parameter!")

Note: here is the complete main.py in one piece:

import requests
import re
from bs4 import BeautifulSoup
import os
# lxml only needs to be installed as the parser backend; no direct import required

def is_update(url):
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
    }
    try:
        resp = requests.get(url, headers=headers)
        resp.raise_for_status()
        resp.encoding = 'utf-8'
    except requests.RequestException:
        print("爬取失败")
        return 0

    # pattern reconstructed as explained above; the tags were lost during extraction
    resp = re.findall(r'<dd><a href ="/book/1030/\d+\.html">(.*?)</a></dd>', resp.text)
    with open("小说更新记录.txt", "r", encoding='utf-8') as f:
        data = f.read()
    if data == str(resp[-1]):
        return 0
    else:
        data_num = ''.join(re.findall(r'\d+', data))
        resp_num = ''.join(re.findall(r'\d+', resp[-1]))
        gap_num = int(resp_num) - int(data_num)
        with open("小说更新记录.txt", "w", encoding='utf-8') as f:
            f.write(str(resp[-1]))
            print("writing is ok!")
        return gap_num

def download_novel(return_value):
    if return_value >= 1:
        os.makedirs("./novel_downloads", exist_ok=True)
        for i in range(1, return_value + 1):
            print(i)
            with open("小说更新记录.txt", "r", encoding='utf-8') as f:
                data = f.read()
                data_num = ''.join(re.findall(r'\d+', data))
                download_num = int(data_num) + 1 - (i - 1)
                print(novel_url + str(download_num) + '.html')
            resp = requests.get(novel_url + str(download_num) + '.html')
            resp.encoding = 'utf-8'
            soup = BeautifulSoup(resp.text, 'lxml')
            soup.select('#chaptercontent')
            mytxt = soup.text[soup.text.find('下一章'):soup.text.rfind('『点此报错')]
            mytxt = mytxt[3:]
            mytxt = mytxt.strip()
            mytxt = mytxt.replace('  ', '\n')
            novel_save_location = "./novel_downloads/逆天邪神第" + str(download_num - 1) + "章.txt"
            with open(novel_save_location, "w", encoding='utf-8') as f:
                f.write(mytxt)
            print("下载完毕!")
    else:
        print("invalid parameter!")

if __name__ == '__main__':
    novel_url = "https://www.bige3.com/book/1030/"
    return_value = is_update(novel_url)
    if return_value == 0:
        print("小说尚未更新!")
    else:
        print("小说已更新" + str(return_value) + "章!")
        print("正在下载已更新的小说......")
        download_novel(return_value)
    os.system("pause")  # keep the console window open on Windows
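One practical point the post never states outright: 小说更新记录.txt must already exist before the first run, holding the title of the last chapter you read. A one-time seed could look like this (the chapter title here is only an example value, not from the post):

with open("小说更新记录.txt", "w", encoding='utf-8') as f:
    f.write("第1930章 某某某")  # example: your last-read chapter title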

Drawbacks: the script is single-threaded; it uses no async I/O and no thread pool, so it gains no speed advantage when a large number of chapters need to be downloaded. I plan to optimize the code later to add this.
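As a rough sketch of that optimization (not part of the original post; fetch_chapter and fetch_many are hypothetical helpers, and the URL scheme is assumed to match the code above), a concurrent.futures thread pool could fetch several chapter pages in parallel:

import concurrent.futures
import requests

def fetch_chapter(novel_url, page_id):
    # hypothetical helper: fetch one chapter page and hand back its HTML
    resp = requests.get(novel_url + str(page_id) + '.html')
    resp.encoding = 'utf-8'
    return page_id, resp.text

def fetch_many(novel_url, page_ids, max_workers=8):
    # download many chapter pages concurrently with a thread pool
    pages = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(fetch_chapter, novel_url, pid) for pid in page_ids]
        for fut in concurrent.futures.as_completed(futures):
            page_id, html = fut.result()
            pages[page_id] = html  # parse and save here, as download_novel() does
    return pages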

Results:

For example, the chapter list currently looks like this:

The latest chapter is 第1936章 灾厄奏鸣; I'll change the recorded number to demonstrate.

If I don't change it, there's no new chapter to fetch:

After changing it and running the script, the output should be:

The corresponding folder then contains:

Opening a file, the content is:

Over!!!!!

Steps:

The result is:

Knowledge points used in this project:

A few of these got dropped when I optimized the program, heh.

Sending a GET request with a User-Agent header:

resp = requests.get(url, headers=headers)
Extracting every digit from a string and joining them into one number:

data_num = re.findall(r'\d+', data)
data_num = ''.join(data_num)
Grabbing the chapter titles from the page. (The HTML tags inside the original pattern were lost when this article was extracted; the pattern below is my plausible reconstruction of the chapter-list markup.)

resp = re.findall(r'<dd><a href ="/book/1030/\d+\.html">(.*?)</a></dd>', resp.text)
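A quick demonstration of how the title list and the digit extraction fit together (the older title is made up; the newest one is the chapter mentioned earlier in this post):

import re

titles = ['第1935章 某某', '第1936章 灾厄奏鸣']  # what the findall above returns
latest = titles[-1]                              # last element = newest chapter
num = ''.join(re.findall(r'\d+', latest))        # keep only the digits
print(num)  # -> '1936'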

encoding='utf-8' is necessary here; without it, open() falls back to the locale default encoding (commonly GBK on Chinese Windows) and reading the file raises UnicodeDecodeError.

with open("小说更新记录.txt", "r", encoding='utf-8') as f:      data = f.read()
with open("小说更新记录.txt", "w", encoding='utf-8') as f:      f.write(str(resp[-1]))

The # in a CSS selector picks out a tag by its id:

soup = BeautifulSoup(resp.text, 'lxml')
soup.select('#chaptercontent')
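A minimal, self-contained illustration (the HTML string is invented):

from bs4 import BeautifulSoup

html = '<div id="chaptercontent">第一段  第二段</div>'
soup = BeautifulSoup(html, 'lxml')
matches = soup.select('#chaptercontent')  # CSS id selector -> list of matching tags
print(matches[0].text)                    # -> '第一段  第二段'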
Negative indexing takes the last element of a list, i.e. the newest chapter title:

resp[-1]
soup.text is a plain str. find('下一章') returns the index of the first match searching from the left, and rfind('『点此报错') the index of the first match searching from the right, so slicing between the two keeps just the chapter body:

mytxt = soup.text[soup.text.find('下一章'):soup.text.rfind('『点此报错')]
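A toy example of that slice:

s = '导航 下一章正文内容……『点此报错 页脚'
start = s.find('下一章')     # index of the first match, searching from the left
end = s.rfind('『点此报错')  # index of the first match, searching from the right
body = s[start:end][3:]      # slice between them, then drop the '下一章' itself
print(body)  # -> '正文内容……'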
Building the save path, with the chapter number embedded in the file name:

novel_save_location = "./novel_downloads/逆天邪神第" + str(download_num - 1) + "章.txt"

1. The extracted text has stray whitespace around it, so strip it directly:

mytxt = mytxt.strip()

I'm not sure why I never removed that leftover character completely. As I remember the online course putting it: strip out the spaces, blanks and line breaks and everything else is gone, and in the end the novel text keeps some gaps between paragraphs.

The workaround: I couldn't identify what the character actually was (even in Notepad++), so I simply copied the blank span straight out of the text and pasted it into replace():

mytxt = mytxt.replace('  ', '\n')
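For what it's worth, this is a guess of mine rather than anything the post verified: the "blank" characters in web-novel pages are often full-width ideographic spaces (U+3000) rather than ASCII spaces, so an explicit escape would do the same job:

mytxt = mytxt.replace('\u3000\u3000', '\n')  # assumption: two U+3000 spaces separate paragraphs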

Thanks for reading!!! This is my first write-up; it went slowly and I'm still pretty green. Off to do my homework now, sob.

Original: https://www.cnblogs.com/bluemapleleaf/p/15974104.html
Author: bulemaple
Title: python爬虫之抓取小说(逆天邪神)

