简介
此程序是本人大三时期的Python大作业,初学Python后所编写的一个程序,是一个网络爬虫程序,可爬取指定网站的信息。
本程序爬取的网站是Bangumi-我看过的动画,Bangumi是一个专注于二次元的收视进度管理网站,在这里可以记录自己看过的动画和书籍、玩过的游戏、听过的音乐等等,本程序爬取的正是作者本人看过的所有动画,读者若想爬取自己看过的动画,可下载程序后,自行修改源代码中的相应网址。
本程序使用Python编写,使用PyCharm进行开发,数据库使用MySQL数据库,程序可将“Bangumi-我看过的动画”中的所有动画信息爬取下来,并保存至数据库和Excel表格中,亦可将爬取的网站html源码保存至本地,作者还编写了一个JavaWeb程序,用网页的形式展示爬取到的所有动画信息。
注:“Bangumi-我看过的动画”网页源码可能会改变,所以请注意该爬虫程序的时效性。
程序源代码及程序设计说明书可点击下方链接进行下载,供各位需要的人学习参考。
下载链接:Python爬虫-Bangumi
目录
程序代码
在此展示Python爬虫的完整代码,代码不多做介绍,详细请看代码注释或程序设计说明书,若读者对JavaWeb展示爬取数据感兴趣,可至文章开头处下载程序,进行了解。
import os
import re
import ssl
import urllib.error
import urllib.request

import pymysql
import xlwt
from bs4 import BeautifulSoup
def main(baseurl="https://bangumi.tv/anime/list/430090/collect"):
    """Crawl a Bangumi "watched anime" collection and persist the data.

    Fetches every page of the collection at *baseurl*, saves each page's
    rewritten HTML locally, and stores the parsed records in MySQL and in
    an Excel workbook (via the helper functions in this module).

    Args:
        baseurl: Collection URL to crawl. Defaults to the author's own
            list; pass another user's collection URL to crawl theirs.
    """
    print("开始爬取网站")
    # Disable certificate verification globally so urllib does not fail on
    # systems without an up-to-date CA bundle.
    ssl._create_default_https_context = ssl._create_unverified_context
    pagecount = getPageCount(baseurl)
    datalist = getData(baseurl, pagecount)
    saveDataToDatabase(datalist)
    saveDataToExcel(datalist)
    print("网站爬取成功,完毕!!!")
def getHTML(url):
    """Download *url* and return its body decoded as UTF-8 text.

    A desktop browser User-Agent is sent because the site rejects the
    default urllib User-Agent.

    Args:
        url: Absolute URL to fetch.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: On network or HTTP failures.
    """
    print("正在获取页面 "+url+" ......")
    headers = {
        # Fixed: the original string had stray spaces (blog-paste artifact),
        # e.g. "Mozilla / 5.0(Windows..." — not a valid User-Agent format.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=headers)
    # Close the connection deterministically instead of leaking it until GC.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    print("页面获取成功")
    return html
def saveHTML(html, page):
    """Write the (already link-rewritten) page HTML to lib/html/page<page>.html.

    Args:
        html: Full HTML document as a string.
        page: 1-based page number used to build the file name.
    """
    print("正在保存页面"+str(page)+"......")
    # Create the target directory so the program works from a fresh checkout.
    os.makedirs("lib/html", exist_ok=True)
    # "with" guarantees the file is closed even if the write raises.
    with open("lib/html/page"+str(page)+".html", "w", encoding="utf-8") as fileobj:
        fileobj.write(html)
    print("页面"+str(page)+"保存成功")
def getPageCount(baseurl):
    """Return the number of pages in the collection at *baseurl*.

    Downloads the first page and inspects the pagination widget:
      * no <li> items at all        -> 0 pages (empty collection)
      * no "#multipage" div         -> 1 page (no pagination rendered)
      * a "p_edge" span present     -> total is the second number in its text
        (presumably "( current / total )" — TODO confirm against live markup)
      * otherwise                   -> second-to-last pagination link holds
                                       the highest page number, or 1 if none.

    Args:
        baseurl: Collection URL without the ?page= query.

    Returns:
        int: Total page count (0 when the collection is empty).
    """
    pagecount = 0
    html = getHTML(baseurl)
    print("正在获取页面总页数......")
    soup = BeautifulSoup(html, "html.parser")
    ul = soup.find("ul", id="browserItemList")
    li = ul.find("li")
    if li is None:
        pagecount = 0
    else:
        div = soup.find("div", id="multipage")
        if div is None:
            # Robustness fix: a single-page collection may render no
            # pagination block at all; the original crashed here.
            pagecount = 1
        else:
            span = div.find("span", class_="p_edge")
            if span is not None:
                result = re.findall(r'[0-9]+', span.string)
                pagecount = int(result[1])
            else:
                alist = div.find_all("a")
                if len(alist) != 0:
                    # The last <a> is "next page"; the one before it shows
                    # the highest page number.
                    pagecount = int(alist[len(alist) - 2].string)
                else:
                    pagecount = 1
    print("页面总数获取成功,页面总数为{}页".format(pagecount))
    return pagecount
def changeLink(html):
    """Rewrite site-relative links in *html* to absolute bangumi.tv URLs.

    This lets the saved page render correctly when opened from the local
    file system.

    Args:
        html: Raw HTML of a collection page.

    Returns:
        str: The prettified HTML with links rewritten.
    """
    print("正在修改相关链接......")
    soup = BeautifulSoup(html, "html.parser")
    # Site-relative URLs ("/foo") need the scheme+host prefix.
    _absolutize(soup, "a", "href", r'/[^\s]*', "https://bangumi.tv")
    _absolutize(soup, "link", "href", r'/[^\s]*', "https://bangumi.tv")
    _absolutize(soup, "script", "src", r'/[^\s]*', "https://bangumi.tv")
    _absolutize(soup, "form", "action", r'/[^\s]*', "https://bangumi.tv")
    # Protocol-relative image URLs ("//host/...") only need a scheme.
    _absolutize(soup, "img", "src", r'//[^\s]*', "https:")
    # Hard-coded replacement cover for one specific entry.
    # NOTE(review): presumably that entry's cover was broken on the live
    # site — confirm this special case is still needed.
    item = soup.find("li", id="item_7157")
    if item is not None:
        img = item.find("img")
        img['src'] = "https://lain.bgm.tv/pic/cover/s/6e/01/7157_QV8Rz.jpg"
    # The avatar image is a CSS background-image inside a style attribute.
    span = soup.find("span", class_="avatarNeue")
    if span is not None:  # guard: the original crashed if the element was absent
        span['style'] = re.sub(r'//[^\s]*', "https:" + re.search(r'//[^\s]*', span['style']).group(), span['style'])
    div = soup.find("div", id="robot_speech")
    if div is not None:  # guard: same robustness fix as above
        a = div.find("a", class_="nav")
        a['href'] = "https://bangumi.tv/" + a['href']
    print("相关链接修改成功")
    return soup.prettify()

def _absolutize(soup, tag, attr, pattern, prefix):
    """Prefix *attr* of every *tag* whose value matches *pattern* with *prefix*."""
    for node in soup.find_all(tag):
        if attr in node.attrs and re.match(pattern, node[attr]) is not None:
            node[attr] = prefix + node[attr]
def toLocal():
    """Placeholder: exporting data to local storage is not implemented yet."""
def getData(baseurl, pagecount):
    """Crawl every collection page and parse each anime entry.

    For each page: download it, rewrite its links, save the HTML locally,
    then extract one record per <li> item and download both cover images.

    Args:
        baseurl: Collection URL without the ?page= query.
        pagecount: Number of pages to crawl (from getPageCount).

    Returns:
        list[list[str]]: one 14-field record per anime, in the column order
        used by saveDataToDatabase/saveDataToExcel: [id, chinese name,
        original name, episodes, broadcast date, staff, collect date,
        score, tags, page url, thumbnail url, thumbnail path,
        high-res cover url, high-res cover path].
    """
    datalist = []
    for i in range(1, pagecount + 1):
        url = baseurl + "?page=" + str(i)
        html = getHTML(url)
        html = changeLink(html)
        saveHTML(html, i)
        print("开始爬取解析页面"+str(i))
        soup = BeautifulSoup(html, "html.parser")
        all_animation = soup.find("ul", id="browserItemList")
        for item in all_animation.find_all("li"):
            data = []
            # Entry id, e.g. "item_7157" -> "7157".
            idd = re.search(r'[0-9]+', item['id']).group()
            data.append(idd)
            print("正在解析动画(id:{})数据......".format(idd))
            chinese_name = item.find("a", class_="l").string.strip()
            data.append(chinese_name)
            small = item.find("small", class_="grey")
            original_name = "" if small is None else small.string.strip()
            data.append(original_name)
            # The info line carries episode count, air date and staff.
            info = item.find("p", class_="info tip").string
            episodes = re.search(r'[0-9]+', info).group()
            # Hoisted: the original ran each of these regexes twice.
            cn_date = re.search(r'[\d]+年[\d]+月[\d]+日', info)
            iso_date = re.search(r'[\d]+-[\d]+-[\d]+', info)
            if cn_date is not None:
                # Normalize "2020年1月5日" -> "2020-1-5".
                broadcast_time = re.sub(r'[^\d]+', "-", cn_date.group()).strip("-")
            elif iso_date is not None:
                broadcast_time = iso_date.group()
            else:
                broadcast_time = ""
            staff = re.search(r'日.+', info)
            people = "" if staff is None else staff.group()[4:].strip()
            data.append(episodes)
            data.append(broadcast_time)
            data.append(people)
            star_time = item.find("span", class_="tip_j").string.strip()
            data.append(star_time)
            # The score is encoded in the element's second CSS class
            # (presumably e.g. "stars8" — verify against live markup).
            score = item.find("span", class_="starlight")['class'][1]
            score = re.search(r'[0-9]+', score).group()
            data.append(score)
            tip = item.find("span", class_="tip")
            tag = "" if tip is None else tip.string.strip()[4:]
            data.append(tag)
            page_url = item.find("a", class_="l")['href']
            data.append(page_url)
            print("正在下载缩略封面图{}.jpg".format(idd))
            low_image_url = item.find("img", class_="cover")['src']
            data.append(low_image_url)
            low_image_path = "lib/image/low/" + idd + ".jpg"
            data.append(low_image_path)
            # Switch to the bangumi.tv image host before downloading.
            low_image_url = re.sub(r'lain.bgm', "bangumi", low_image_url)
            urllib.request.urlretrieve(low_image_url, low_image_path)
            print("正在下载高清封面图{}.jpg".format(idd))
            # "/s/" (small) -> "/l/" (large) yields the high-res URL.
            high_image_url = re.sub(r'/s/', "/l/", low_image_url)
            data.append(high_image_url)
            high_image_path = "lib/image/high/" + idd + ".jpg"
            data.append(high_image_path)
            # Bug fix: the original printed "downloading the high-res cover"
            # but never actually downloaded it, leaving the saved path dead.
            urllib.request.urlretrieve(high_image_url, high_image_path)
            datalist.append(data)
        print("页面{}爬取解析成功".format(str(i)))
    return datalist
def saveDataToDatabase(datalist):
    """Insert all records into the MySQL table web_crawler.animation.

    Commits once after every row is inserted so that a failure rolls the
    whole batch back. (The original committed inside the loop, which made
    the rollback a no-op for already-committed rows.)

    Args:
        datalist: list of 14-field records as produced by getData.
    """
    print("开始将数据保存至数据库")
    con = pymysql.connect(host="localhost", database="web_crawler", user="root", password="root")
    cur = con.cursor()
    sql = "insert into animation values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    try:
        for data in datalist:
            print("正在保存动画(id:{})数据至数据库......".format(data[0]))
            cur.execute(sql, tuple(data))
        con.commit()  # single commit: all-or-nothing
    except Exception as e:
        con.rollback()
        print("数据保存失败")
        print(e)  # surface the cause instead of swallowing it silently
    else:
        print("数据保存成功")
    finally:
        # Close resources even when the insert fails.
        cur.close()
        con.close()
def saveDataToExcel(datalist):
    """Export all records to lib/excel/Bangumi-我看过的动画.xls.

    Writes a bold header row followed by one row per record, with columns
    in the same order as the records produced by getData.

    Args:
        datalist: list of 14-field records as produced by getData.
    """
    print("开始将数据保存至excel表")
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet("我看过的动画")
    colname = ("ID", "中文名", "原名", "话数", "放送开始时间", "导演/原作者/等制作人", "收藏时间", "个人评分", "个人标签", "页面网址", "缩略封面图网址", "缩略封面图本地路径", "高清封面图网址", "高清封面图本地路径")
    style = xlwt.easyxf('font: bold on')
    # Header row (bold); width derived from colname instead of a magic 14.
    for col, title in enumerate(colname):
        sheet.write(0, col, title, style)
    for row, data in enumerate(datalist, start=1):
        print("正在保存动画(id:{})数据至excel表......".format(data[0]))
        for col in range(len(colname)):
            sheet.write(row, col, data[col])
    book.save("lib/excel/Bangumi-我看过的动画.xls")
    print("数据保存成功")
if __name__ == "__main__":
main()
运行结果
代码编写完成后运行程序,程序运行过程中会在控制台实时输出当前爬取进度
爬取完毕后,可看到成功导出html文件
成功下载封面图片
信息成功保存至数据库
成功保存至Excel表格
最后JavaWeb程序也成功展示爬取的数据
后记
本程序仅供学习和参考,请勿抄袭或另作他用。
感谢观看,有什么问题可在下方评论区进行评论,若觉得本文章写得不错,还请点个赞呢。
关注我,收看更多精彩!( • ̀ω•́ )✧求点赞、评论、收藏、关注
Original: https://blog.csdn.net/XiuMu_0216/article/details/125935768
Author: 朽木冰天
Title: Python大作业-网络爬虫程序
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/726496/
转载文章受原作者版权保护。转载请注明原作者出处!