【Python】数据分析、爬取PDF文件使用Jieba词库进行分析计算银行数字化转型指数

这里我使用Jieba对转换好的txt文档读取分词,在此基础上,根据词库对上市银行年度报表进行匹配与词频汇总,同时剔除关键词前存在否定表达的词频后进行对数化得到银行业数字化转型指数。

源代码如下:

导入依赖
import jieba
import numpy as np

# Report title; used to build both the source PDF path and the output TXT path.
text_paths = r'兴业银行2021 年 年 度 报 告'
# NOTE(review): Windows-style relative paths — assumes the 银行\兴业银行 folders
# exist relative to the working directory; verify before running.
text_path = f'银行\\兴业银行\\{text_paths}.pdf'
text_path2 = f'银行\\兴业银行\\TXT\\{text_paths}'

def fun(txt_file=None):
    """Compute the digitalization-index keyword frequency for one report.

    Reads the extracted TXT report, segments it with jieba (precise mode),
    sums the frequencies of the words in the custom digitalization lexicon,
    and prints the total plus its natural logarithm (4 decimal places).

    Args:
        txt_file: optional path of the text file to analyze; defaults to the
            module-level ``text_path2`` + ``.txt``.

    Returns:
        int: the total keyword frequency (also printed).
    """
    if txt_file is None:
        txt_file = f"{text_path2}.txt"
    # Context manager: the original leaked the file handle.
    with open(txt_file, "r", encoding='utf-8') as fh:
        txt = fh.read()
    # Precise-mode segmentation.
    words = jieba.lcut(txt)
    # Tally word frequencies.
    counts = {}
    for word in words:
        # Strip ALL spaces inside a token — the original replaced only
        # double spaces ('  '), leaving single spaces behind.
        word = word.replace(' ', '')
        # Tokens of length <= 1 (including those that became empty after
        # stripping) carry no signal; skip them.
        if len(word) <= 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Custom digitalization lexicon.
    cKu = ["人工智能","网联","平台","智能穿戴","智慧农业","智能风控","智能交通","智能医疗","智能客服","智能投顾","智能柜台","数字营销","数字金融","Fintech","金融科技","量化金融","开放银行","API","网银","私人银行","场景","供应链金融","数字普惠金融","互联网金融","手机银行","APP","NFC支付","移动支付","手机支付","第三方支付","电子商务","私人银行","B2B","B2C","C2B","C2C","O2O","大数据","数字挖掘","信息科技","文本挖掘","数据可视化","异构数据","征信","增强现实","混合现实","虚拟现实","数据分析","IT","物联网","信息物理系统","私有云","公有云","云计算","流计算","图计算","内存计算","多方安全计算","类脑计算","绿色计算","认知计算","融合架构","亿级并发","EB级存储","区块链","数字货币","分布式记账","分布式计算","差分隐私技术","智能金融合约","商业智能","图像理解","投资决策辅助系统","智能数据分析","共享","机器学习","语义搜索","生物识别技术","人脸识别","语音识别","身份验证","人物画像","精准匹配","定制","敏捷化"]
    # Direct dict lookups replace the original O(len(cKu) * len(counts))
    # nested scan; the total is identical.
    nums = sum(counts.get(wd, 0) for wd in cKu)
    print(f"词频数: {nums}")
    # Guard: np.log(0) is -inf (with a runtime warning); report 0 matches
    # as index 0. Kept to 4 decimal places.
    print("取对数后: {:.4f}".format(np.log(nums) if nums > 0 else 0.0))
    return nums

主函数
# Script entry point: run the keyword-frequency analysis.
if __name__ == '__main__':
    fun()

三、终制版代码

要实现需求就需要对上述两种操作分别先后进行,比较不便,为了更加方便大量统计、计算并使用,将上述两模块结合在一起,就是终制版

源代码如下:

import pyocr
import importlib
import sys
import time
import jieba
import numpy as np

# Python-2-era relic (was used for sys.setdefaultencoding); a no-op on
# Python 3 and could be removed.
importlib.reload(sys)
# Record the start time so the total conversion time can be reported later.
time1 = time.time()
print("初始时间为:",time1)

import os.path
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

# Report title; used to build both the source PDF path and the output TXT path.
text_paths = r'南京2020 年年度报告'
# NOTE(review): Windows-style relative paths — assumes the 银行\南京银行 folders
# exist relative to the working directory; verify before running.
text_path = f'银行\\南京银行\\{text_paths}.pdf'
text_path2 = f'银行\\南京银行\\TXT\\{text_paths}'

def parse():
    '''Parse the PDF at ``text_path`` and save the extracted text to a TXT file.

    Uses the legacy pdfminer API (PDFDocument() + doc.get_pages()).

    Raises:
        PDFTextExtractionNotAllowed: when the PDF forbids text extraction.
    '''
    print("------开始转换------")

    # Open each file exactly once with context managers. The original leaked
    # ``fp``, re-opened the output in append mode for EVERY text box, and —
    # because of append mode — duplicated content on repeated runs; it also
    # called f.close() redundantly after the with-block.
    with open(text_path, 'rb') as fp, \
            open(f'{text_path2}.txt', 'w', encoding='utf-8') as out:
        # Wire the parser and the document together.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialize with an empty password.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Shared resource manager, page aggregator and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process one page at a time; each resulting LTPage layout holds
        # LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, ... objects.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    results = x.get_text()
                    print(results)
                    out.write(results + "\n")
    print("------转换完成------")

# ------------------二---------------

def fun():
    """Segment the extracted report text with jieba and print the total
    frequency of the digitalization-lexicon words plus its natural log."""
    # Load the converted report text.
    with open(f"{text_path2}.txt", "r", encoding='utf-8') as fh:
        content = fh.read()
    # Precise-mode segmentation.
    tokens = jieba.lcut(content)
    # Tally each token: drop double spaces, ignore single-character tokens
    # (same filter as before).
    freq = {}
    for tok in tokens:
        tok = tok.replace('  ', '')
        if len(tok) != 1:
            freq[tok] = freq.get(tok, 0) + 1

    # Custom digitalization lexicon.
    cKu = ["人工智能","网联","平台","智能穿戴","智慧农业","智能风控","智能交通","智能医疗","智能客服","智能投顾","智能柜台","数字营销","数字金融","Fintech","金融科技","量化金融","开放银行","API","网银","私人银行","场景","供应链金融","数字普惠金融","互联网金融","手机银行","APP","NFC支付","移动支付","手机支付","第三方支付","电子商务","私人银行","B2B","B2C","C2B","C2C","O2O","大数据","数字挖掘","信息科技","文本挖掘","数据可视化","异构数据","征信","增强现实","混合现实","虚拟现实","数据分析","IT","物联网","信息物理系统","私有云","公有云","云计算","流计算","图计算","内存计算","多方安全计算","类脑计算","绿色计算","认知计算","融合架构","亿级并发","EB级存储","区块链","数字货币","分布式记账","分布式计算","差分隐私技术","智能金融合约","商业智能","图像理解","投资决策辅助系统","智能数据分析","共享","机器学习","语义搜索","生物识别技术","人脸识别","语音识别","身份验证","人物画像","精准匹配","定制","敏捷化"]

    # Sum the frequency of every lexicon word found in the text.
    nums = 0
    for term in cKu:
        nums += freq.get(term, 0)
    print(f"词频数: {nums}")
    print("取对数后: {:.4f}".format(np.log(nums)))

# Entry point: convert the PDF to TXT, report the elapsed time, then
# compute the digitalization index from the TXT.
if __name__ == '__main__':
    parse()
    time2 = time.time()
    print("总共消耗时间为:", time2 - time1)
    fun()

当然,每次都要在代码里手动修改目标文件名仍有诸多不便。可以先读取文件夹下所有文件名,存入列表后循环遍历处理,便可解决这个问题;后续有时间可以再写一写。

读取指定路径下所有文件:

import os

# Folder holding the bank's annual-report files.
# (The comment markers below were lost when this snippet was published,
# which made the bare prose lines a SyntaxError; restored as comments.)
filePath = 'D:\\pythonProject\\数据分析\\银行\\兴业银行'
# os.listdir returns the file names under filePath as a list.
fileNames = os.listdir(filePath)
# Show the whole list, then one name per line.
print(fileNames)
print('---------')

for name in fileNames:
    print(name)

四、追更:

原来import的对PDF有关的包做了调整,故现在报错,更新如下:(2023-1-9)

import os.path
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

改后:

import os.path
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument  import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfpage  import PDFTextExtractionNotAllowed

完整代码:

import pyocr
import importlib
import sys
import time
import jieba
import numpy as np

# Python-2-era relic (was used for sys.setdefaultencoding); a no-op on
# Python 3 and could be removed.
importlib.reload(sys)
# Record the start time so the total conversion time can be reported later.
time1 = time.time()
print("初始时间为:",time1)

import os.path

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser

# Report title; used to build both the source PDF path and the output TXT path.
text_paths = r'南京2020 年年度报告'
# NOTE(review): Windows-style relative paths — assumes the 银行\南京银行 folders
# exist relative to the working directory; verify before running.
text_path = f'银行\\南京银行\\{text_paths}.pdf'
text_path2 = f'银行\\南京银行\\TXT\\{text_paths}'

def parse():
    '''Parse the PDF at ``text_path`` with pdfminer.six and save the text
    to ``text_path2``.txt.

    The imports had already been migrated to pdfminer.six, but the body
    still used the legacy API — ``PDFDocument()`` with no arguments,
    ``doc.set_parser``, ``doc.initialize`` and ``doc.get_pages()`` — none of
    which exist any more, so the function crashed at runtime. This version
    uses the current API: ``PDFDocument(parser)`` plus
    ``PDFPage.create_pages(doc)`` (requires
    ``from pdfminer.pdfpage import PDFPage``).

    Raises:
        PDFTextExtractionNotAllowed: when the PDF forbids text extraction.
    '''
    print("------开始转换------")

    # Open both files once (the original leaked fp and re-opened the output
    # in append mode for every text box, duplicating content on reruns).
    with open(text_path, 'rb') as fp, \
            open(f'{text_path2}.txt', 'w', encoding='utf-8') as out:
        parser = PDFParser(fp)
        # pdfminer.six: the parser is passed to the constructor; an empty
        # password is the default.
        doc = PDFDocument(parser)

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resource manager, page aggregator and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # PDFPage.create_pages replaces the removed doc.get_pages().
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # layout is an LTPage holding LTTextBox / LTFigure / LTImage /
            # LTTextBoxHorizontal objects; keep the horizontal text boxes.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    results = x.get_text()
                    print(results)
                    out.write(results + "\n")
    print("------转换完成------")

# ------------------二---------------

def fun():
    """Segment the extracted report text with jieba and print the total
    frequency of the digitalization-lexicon words plus its natural log."""
    # Read the converted report text.
    with open(f"{text_path2}.txt", "r", encoding='utf-8') as report:
        raw_text = report.read()
    # Precise-mode segmentation.
    segmented = jieba.lcut(raw_text)
    # Build the frequency table, dropping double spaces and skipping
    # single-character tokens (same filter as before).
    tally = {}
    for piece in segmented:
        cleaned = piece.replace('  ', '')
        if len(cleaned) == 1:
            continue
        tally[cleaned] = tally.get(cleaned, 0) + 1

    # Custom digitalization lexicon.
    cKu = ["人工智能","网联","平台","智能穿戴","智慧农业","智能风控","智能交通","智能医疗","智能客服","智能投顾","智能柜台","数字营销","数字金融","Fintech","金融科技","量化金融","开放银行","API","网银","私人银行","场景","供应链金融","数字普惠金融","互联网金融","手机银行","APP","NFC支付","移动支付","手机支付","第三方支付","电子商务","私人银行","B2B","B2C","C2B","C2C","O2O","大数据","数字挖掘","信息科技","文本挖掘","数据可视化","异构数据","征信","增强现实","混合现实","虚拟现实","数据分析","IT","物联网","信息物理系统","私有云","公有云","云计算","流计算","图计算","内存计算","多方安全计算","类脑计算","绿色计算","认知计算","融合架构","亿级并发","EB级存储","区块链","数字货币","分布式记账","分布式计算","差分隐私技术","智能金融合约","商业智能","图像理解","投资决策辅助系统","智能数据分析","共享","机器学习","语义搜索","生物识别技术","人脸识别","语音识别","身份验证","人物画像","精准匹配","定制","敏捷化"]

    # Total frequency of every lexicon word found in the text.
    nums = sum(tally.get(keyword, 0) for keyword in cKu)
    print(f"词频数: {nums}")
    print("取对数后: {:.4f}".format(np.log(nums)))

# Entry point: convert the PDF to TXT, report the elapsed time, then
# compute the digitalization index from the TXT.
if __name__ == '__main__':
    parse()
    time2 = time.time()
    print("总共消耗时间为:", time2 - time1)
    fun()

但是我还是遇到了各类大大小小的问题,故也可使用这一块的代码【缺点是效率低】:

# coding:utf-8
import os
import re
from pdfminer.converter import LTChar, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox
from io import StringIO
from io import open

#读取pdf文件文本内容
def read(path):
    """Extract all text from an opened PDF file object.

    Args:
        path: a binary file object for the PDF (despite the name this is a
            file handle, not a path string — see the caller).

    Returns:
        str: the concatenated text of every LTTextBox on every page.

    Raises:
        PDFTextExtractionNotAllowed: when the PDF forbids text extraction.
    """
    parser = PDFParser(path)
    doc = PDFDocument(parser, '')
    parser.set_document(doc)
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Shared resource manager, page aggregator and interpreter.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The original called interpreter.process_page(page) TWICE per page
    # (once unconditionally, once behind an always-true None check),
    # doubling the work, and built the result with quadratic string
    # concatenation. Process once and join the parts at the end.
    parts = []
    for i, page in enumerate(PDFPage.create_pages(doc)):
        print("START PAGE %d\n" % i)
        interpreter.process_page(page)
        print("END PAGE %d\n" % i)
        # layout is an LTPage holding LTTextBox / LTFigure / LTImage /
        # LTTextBoxHorizontal objects; keep the text boxes.
        layout = device.get_result()
        print(layout)
        page_text = ''.join(
            x.get_text().strip() for x in layout if isinstance(x, LTTextBox)
        )
        parts.append(page_text)
    # All extracted text of the PDF.
    return ''.join(parts)

if __name__ == '__main__':
    # Batch-convert every PDF under `path` into txt1/<name>.txt.
    path = '需要找DF的银行'
    pdfList = os.listdir(path)
    pdf_num = 0
    for li in pdfList:
        try:
            # `with` closes the handle even on failure — the original leaked
            # it and its bare `except:` silently swallowed every exception,
            # including KeyboardInterrupt.
            with open(os.path.join(path, li), "rb") as pdffile:
                content = read(pdffile)
        except Exception as exc:
            print("FAILED:", li, exc)
            continue
        # r'\.pdf$': the original pattern '.pdf' treated '.' as a regex
        # wildcard and matched anywhere in the name; also renamed the
        # variable, which shadowed the builtin `str`.
        txt_name = re.sub(r'\.pdf$', '.txt', li)
        file1 = 'txt1/' + txt_name
        with open(file1, 'w+', encoding='utf8') as f:
            f.write(content)
        pdf_num = pdf_num + 1
        # handleData(txt_name)
        print("DONE:" + txt_name)
    print('number of done-article:', end="")
    print(pdf_num)

终于成功解决了转换问题!!

后续完整代码有时间再更新…

Original: https://www.cnblogs.com/HanaKoo/p/16490282.html
Author: HanaKoo
Title: 【Python】数据分析、爬取PDF文件使用Jieba词库进行分析计算银行数字化转型指数

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/568330/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球