Scraping Douban for movie data (saving to an Excel spreadsheet and to SQLite)

1. Code for saving to an Excel spreadsheet:


from bs4 import BeautifulSoup   # HTML parsing and data extraction
import re   # regular expressions for text matching
import urllib.request,urllib.error,urllib.parse   # build URLs and fetch page data
import xlwt   # Excel file operations
import sqlite3   # SQLite database operations

def main():
    baseurl = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages
    datalist = getData(baseurl)
    savepath = "豆瓣电影Top250.xls"
    # 3. Save the data
    saveData(datalist, savepath)
# Pattern for the film's detail-page link
findLink = re.compile(r'<a href="(.*?)">')   # compile the regular expression (a string pattern)
# Film poster image
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)   # re.S makes "." match newlines too
# Film title
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Film rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of raters
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary
findInq = re.compile(r'<span class="inq">(.*)</span>')

# Related details of the film
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)
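
# Quick sanity check of the patterns above (a minimal sketch; `sample` is a
# hand-written fragment imitating Douban's markup, not fetched from the site):
# >>> sample = '<span class="rating_num" property="v:average">9.7</span>'
# >>> re.findall(findRating, sample)
# ['9.7']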

# Crawl the pages
def getData(baseurl):
    datalist = []
    for i in range(0, 10):   # call the page-fetching function 10 times (25 films per page)
        url = baseurl + str(i*25)
        html = askURL(url)
        # 2. Parse the data page by page
        soup = BeautifulSoup(html, "html.parser")   # build the parse tree
        for item in soup.find_all('div', class_="item"):   # collect every matching tag into a list
            data = []   # holds all the information for one film
            item = str(item)
            # link: the film's detail-page URL
            link = re.findall(findLink, item)[0]   # re searches the string with the compiled pattern
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)   # add the image URL
            titles = re.findall(findTitle, item)   # a film may have both a Chinese and a foreign title
            if len(titles) == 2:
                ctitle = titles[0]                    # Chinese title
                data.append(ctitle)
                otitle = titles[1].replace("/", "")   # foreign title
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(' ')   # leave the foreign title blank
            rating = re.findall(findRating, item)[0]
            data.append(rating)   # add the rating
            judgeNum = re.findall(findJudge, item)[0]
            data.append(judgeNum)   # add the number of raters
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")   # strip the trailing full stop
                data.append(inq)   # add the summary
            else:
                data.append(" ")   # leave it blank
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', "", bd)   # remove <br/> tags
            bd = re.sub('/', " ", bd)                   # replace slashes with spaces
            data.append(bd.strip())                     # strip leading/trailing whitespace
            datalist.append(data)   # one fully processed film record

    print(datalist)
    return datalist
# Fetch the page content of a given URL
def askURL(url):
    head = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
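    # Note: without a browser-like User-Agent, Douban tends to reject the
    # request (commonly with an HTTP 418 response), so this header matters.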
    request = urllib.request.Request(url,headers=head)
    html=""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e,"code"):
          print(e.code)
        if hasattr(e,"reason"):
          print(e.reason)
    return html

# Save the data to an Excel spreadsheet
def saveData(datalist, savepath):
    print("saving...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)   # create the workbook object
    sheet = book.add_sheet("sheet1", cell_overwrite_ok=True)   # create the worksheet
    col = ("电影详情链接","图片链接","影片中文名","影片外国名","评分","评价人数","概况","相关信息")
    for i in range(0, 8):
        sheet.write(0, i, col[i])   # column headers
    for i in range(len(datalist)):   # iterate over however many records were actually scraped
        print("record %d" % (i+1))
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i+1, j, data[j])   # data cells
    book.save(savepath)   # save the workbook

if __name__ == "__main__":   # program entry point
    main()
    print("scraping finished")

Final result (Excel spreadsheet): (screenshot omitted)

2. Storing the data with a SQLite database

Common statements for working with a SQLite database:

import sqlite3   # database module
conn = sqlite3.connect("test.db")   # open (or create) the database file
print("Opened database successfully")
c = conn.cursor()   # get a cursor

sql = '''the SQL statement to run
'''

c.execute(sql)   # execute the SQL statement
conn.commit()    # commit the transaction
conn.close()     # close the database
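
As a concrete illustration of the template above (a minimal sketch; the table test_table and its columns are made up for demonstration):

import sqlite3

conn = sqlite3.connect("test.db")
c = conn.cursor()
# create a throwaway table and insert one row with a parameterized statement
c.execute("create table if not exists test_table (id integer primary key, name text)")
c.execute("insert into test_table (name) values (?)", ("hello",))
conn.commit()
for row in c.execute("select id, name from test_table"):
    print(row)
conn.close()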

from bs4 import BeautifulSoup   # HTML parsing and data extraction
import re   # regular expressions for text matching
import urllib.request,urllib.error,urllib.parse   # build URLs and fetch page data
import xlwt   # Excel file operations
import sqlite3   # SQLite database operations

def main():
    baseurl = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages
    datalist = getData(baseurl)
    # Set the save target: dbpath saves to SQLite, savepath saves to Excel
    dbpath = "movie.db"
    #savepath = "豆瓣电影Top250.xls"
    # 3. Save the data: saveData(datalist, savepath) writes the Excel file
    #saveData(datalist, savepath)
    saveData2(datalist, dbpath)
# Pattern for the film's detail-page link
findLink = re.compile(r'<a href="(.*?)">')   # compile the regular expression (a string pattern)
# Film poster image
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)   # re.S makes "." match newlines too
# Film title
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Film rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of raters
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary
findInq = re.compile(r'<span class="inq">(.*)</span>')

# Related details of the film
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)

# Crawl the pages
def getData(baseurl):
    datalist = []
    for i in range(0, 10):   # call the page-fetching function 10 times (25 films per page)
        url = baseurl + str(i*25)
        html = askURL(url)
        # 2. Parse the data page by page
        soup = BeautifulSoup(html, "html.parser")   # build the parse tree
        for item in soup.find_all('div', class_="item"):   # collect every matching tag into a list
            data = []   # holds all the information for one film
            item = str(item)
            # link: the film's detail-page URL
            link = re.findall(findLink, item)[0]   # re searches the string with the compiled pattern
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)   # add the image URL
            titles = re.findall(findTitle, item)   # a film may have both a Chinese and a foreign title
            if len(titles) == 2:
                ctitle = titles[0]                    # Chinese title
                data.append(ctitle)
                otitle = titles[1].replace("/", "")   # foreign title
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(' ')   # leave the foreign title blank
            rating = re.findall(findRating, item)[0]
            data.append(rating)   # add the rating
            judgeNum = re.findall(findJudge, item)[0]
            data.append(judgeNum)   # add the number of raters
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")   # strip the trailing full stop
                data.append(inq)   # add the summary
            else:
                data.append(" ")   # leave it blank
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', "", bd)   # remove <br/> tags
            bd = re.sub('/', " ", bd)                   # replace slashes with spaces
            data.append(bd.strip())                     # strip leading/trailing whitespace
            datalist.append(data)   # one fully processed film record

    #print(datalist)   # print the scraped data
    return datalist
# Fetch the page content of a given URL
def askURL(url):
    head = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    request = urllib.request.Request(url,headers=head)
    html=""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e,"code"):
          print(e.code)
        if hasattr(e,"reason"):
          print(e.reason)
    return html

# Save the data to an Excel spreadsheet
def saveData(datalist, savepath):
    print("saving...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)   # create the workbook object
    sheet = book.add_sheet("sheet1", cell_overwrite_ok=True)   # create the worksheet
    col = ("电影详情链接","图片链接","影片中文名","影片外国名","评分","评价人数","概况","相关信息")
    for i in range(0, 8):
        sheet.write(0, i, col[i])   # column headers
    for i in range(len(datalist)):   # iterate over however many records were actually scraped
        print("record %d" % (i+1))
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i+1, j, data[j])   # data cells
    book.save(savepath)   # save the workbook

# Save the data to SQLite
def saveData2(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()

    for data in datalist:
        for index in range(len(data)):
            if index == 4 or index == 5:   # the score and rater count are numeric, so no quoting
                continue
            data[index] = '"' + data[index] + '"'   # wrap the text columns in double quotes
        sql = '''
        insert into movie250(
        info_link,pic_link,cname,ename,score,rated,introduction,info)
        values(%s)
        ''' % ",".join(data)
        print(sql)
        cur.execute(sql)
        conn.commit()

    cur.close()
    conn.close()
    print("saved to SQLite")

def init_db(dbpath):
    sql = '''
    create table if not exists movie250
    (id integer primary key autoincrement,
    info_link text,
    pic_link text,
    cname varchar,
    ename varchar,
    score numeric,
    rated numeric,
    introduction text,
    info text
    )
    '''   # create the data table ("if not exists" lets the script be rerun safely)
    conn = sqlite3.connect(dbpath)   # open (or create) the database file
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    conn.close()

if __name__ == "__main__":   # program entry point
    main()
    print("scraping finished")

Run result: (screenshot omitted)

Original: https://blog.csdn.net/qq_42514371/article/details/122563279
Author: 脱发的小猿
Title: 对豆瓣进行爬虫来获取相关数据(分别保存到Excel表格和sqlite中)
