Python实现对中文文本分句

基于该思路的文本相似度对比程序

问题

实现对文本的分句,大致来说主要是以中文的句号、感叹号、问号等符号进行分句。难点在于直接按这些符号分句,可能会把人物说话的语句(引号内的对话)也拆开!

步骤

  1. 分段
    首先读取文本,文本读取后整体是一个字符串,每一个段之间是空白,所以分段之间按照空白分开来即可,最后存入一个paragraph_list,注意该list的下标就是段落的顺序号!其他的这里就不再多赘述!(可以查看最后的整体代码)
  2. 分句
    首先拿到上面分好的paragraph_list,循环拿到每一段,然后对每一段直接按照分句规则(正则表达式)进行分句,参考该文章
import re

def cut_sent(para):
    """Split a Chinese paragraph into sentences.

    A line break is inserted after each sentence terminator (``。``, ``！``,
    ``？`` and the ASCII forms ``!`` / ``?``), after a six-dot ``......`` or a
    doubled ``……`` ellipsis, and after a terminator that is directly followed
    by a closing quote (``”`` or ``’``).  The text is then split on line
    breaks, so line breaks already present in ``para`` also act as boundaries.

    ASCII ``"`` and ``'`` are deliberately not treated as closing quotes,
    because they cannot be told apart from opening quotes; a terminator
    followed by an ASCII quote is therefore split before the quote.

    Args:
        para: Paragraph text.

    Returns:
        List of sentence strings in their original order.  Trailing
        whitespace of the paragraph is dropped; an empty paragraph
        yields ``['']``.
    """
    # Single-character terminator not followed by a closing quote.
    para = re.sub(r'([。！？!?])([^”’])', r"\1\n\2", para)
    # ASCII ellipsis (six dots).
    para = re.sub(r'(\.{6})([^”’])', r"\1\n\2", para)
    # Chinese ellipsis (two "…" characters).
    para = re.sub(r'(…{2})([^”’])', r"\1\n\2", para)
    # When a terminator is followed by a closing quote, the quote ends the
    # sentence, so the break goes after the quote instead.
    para = re.sub(r'([。！？!?][”’])([^，,。！？!?])', r"\1\n\2", para)
    # Drop trailing whitespace so no empty sentence is produced at the end.
    para = para.rstrip()
    return para.split("\n")

# Sample paragraph mixing plain sentences with quoted dialogue.
s = (
    '今天天气好啊!'
    '温度高吗?你好,很高兴遇见你,真不错。'
    '小明遇见小红说:"你的衣服这好看!"'
    '小红说:"什么?衣服真好看?真的吗?"'
    '小明回答到:"嗯,真的!我也想买。"'
)

# Print every detected sentence on its own line.
for sentence in cut_sent(s):
    print(sentence)

"""
今天天气好啊!
温度高吗?
你好,很高兴遇见你,真不错。
小明遇见小红说:"你的衣服这好看!
"小红说:"什么?
衣服真好看?
真的吗?
"小明回答到:"嗯,真的!
我也想买。
"
"""
  3. 连接
    这里的解决办法是循环每一句,识别前引号标记 :“ 和后引号 ”,分三种情况处理:
    - 两个符号均有,则该句本身就是完整的一句,直接加入
    - 两个符号都没有,则该句本身就是完整的一句,直接加入
    - 如果只有前面的符号而没有后面的符号,则从这一句开始依次向下拼接,直到遇到含有后引号 ” 的句子,再将拼接好的内容作为一整句加入
def connect(paragraph):
    """Split paragraphs into sentences and re-join split quoted speech.

    Each paragraph is cut with ``cut``.  A fragment that opens a quotation
    with a colon followed by ``“`` (ASCII ``:`` or full-width ``：``) and has
    no closing ``”`` after that marker is treated as unfinished: the
    following fragments are concatenated to it until the closing quote
    appears.  All other fragments are kept as individual sentences.

    ASCII double quotes are not recognised, because an opening ``"`` cannot
    be distinguished from a closing one.

    Args:
        paragraph: Iterable of paragraph strings.

    Returns:
        A list with one entry per paragraph; each entry is the list of
        sentence strings of that paragraph, in order.
    """
    sentence_after = []
    for each_para in paragraph:
        merged = []
        pending = ""  # fragments collected for a quotation that is still open
        for fragment in cut(each_para):
            pending += fragment
            # Position of the most recent "colon + opening quote" marker.
            last_open = max(pending.rfind(':“'), pending.rfind('：“'))
            if last_open >= 0 and '”' not in pending[last_open:]:
                # The quotation is not closed yet: keep collecting.
                continue
            merged.append(pending)
            pending = ""
        if pending:
            # Unterminated quotation at the end of the paragraph: keep the
            # text instead of silently dropping it.
            merged.append(pending)
        sentence_after.append(merged)
    return sentence_after

最后整体代码

我的文本资源
链接:https://pan.baidu.com/s/16iYli6F-IsNkEwO3L2Z90g
密码:vmc2

import re
import pandas as pd

def segments(url, *, chapter=7, encoding="GBK"):
    """Read a novel text file and return the paragraphs of one chapter.

    The file is read with one line per record.  A line counts as a chapter
    heading when it starts with ``第``, contains ``回 `` after the first
    character and is shorter than 30 characters; every heading increments
    the running chapter number.  Once at least 40 chapters have been seen,
    the appendix title line resets the chapter number to 0.

    Args:
        url: Path (or URL) of the text file, passed to ``pandas.read_csv``.
        chapter: Running chapter number whose lines are returned.
            Defaults to 7.
        encoding: Text encoding of the file.  Defaults to ``"GBK"``.

    Returns:
        List of the lines belonging to the requested chapter, in file
        order; the chapter heading line itself is the first element.
    """
    # 'aaa' serves as a separator that is presumably never present in the
    # text, so every line ends up as a single field in column 'txt'.
    # dtype/keep_default_na keep lines such as "NA" or "123" as strings.
    raw = pd.read_csv(
        url,
        names=['txt'],
        sep='aaa',
        encoding=encoding,
        engine='python',
        dtype=str,
        keep_default_na=False,
    )

    # Accept the appendix title with either an ASCII or a full-width colon.
    appendix_titles = ("附录一:成吉思汗家族", "附录一：成吉思汗家族")

    paragraph = []
    chap_num = 0
    for text in raw['txt'].tolist():
        if text[:1] == "第" and text.find("回 ") > 0 and len(text) < 30:
            chap_num += 1
        if chap_num >= 40 and text in appendix_titles:
            chap_num = 0
        if chap_num == chapter:
            paragraph.append(text)
    return paragraph

def cut(para):
    """Split a Chinese paragraph into sentence fragments.

    A line break is inserted after each sentence terminator (``。``, ``！``,
    ``？`` and the ASCII forms ``!`` / ``?``), after a six-dot ``......`` or a
    doubled ``……`` ellipsis, and after a terminator that is directly followed
    by a closing quote (``”`` or ``’``).  The text is then split on line
    breaks.

    ASCII ``"`` and ``'`` are not treated as closing quotes because they
    cannot be told apart from opening quotes.

    Args:
        para: Paragraph text.

    Returns:
        List of fragments in their original order.  Trailing whitespace of
        the paragraph is dropped; an empty paragraph yields ``['']``.
    """
    patterns = (
        r'([。！？!?])([^”’])',            # single terminator, no closing quote next
        r'(\.{6})([^”’])',                # ASCII ellipsis (six dots)
        r'(…{2})([^”’])',                 # Chinese ellipsis (two "…")
        r'([。！？!?][”’])([^，,。！？!?])',  # terminator + closing quote
    )
    for pattern in patterns:
        para = re.sub(pattern, r"\1\n\2", para)
    # Drop trailing whitespace so no empty fragment is produced at the end.
    return para.rstrip().split("\n")

def connect(paragraph):
    """Split paragraphs into sentences and re-join split quoted speech.

    Each paragraph is cut with ``cut``.  A fragment that opens a quotation
    with a colon followed by ``“`` (ASCII ``:`` or full-width ``：``) and has
    no closing ``”`` after that marker is treated as unfinished: the
    following fragments are concatenated to it until the closing quote
    appears.  All other fragments are kept as individual sentences.

    ASCII double quotes are not recognised, because an opening ``"`` cannot
    be distinguished from a closing one.

    Args:
        paragraph: Iterable of paragraph strings.

    Returns:
        A list with one entry per paragraph; each entry is the list of
        sentence strings of that paragraph, in order.
    """
    sentence_after = []
    for each_para in paragraph:
        merged = []
        pending = ""  # fragments collected for a quotation that is still open
        for fragment in cut(each_para):
            pending += fragment
            # Position of the most recent "colon + opening quote" marker.
            last_open = max(pending.rfind(':“'), pending.rfind('：“'))
            if last_open >= 0 and '”' not in pending[last_open:]:
                # The quotation is not closed yet: keep collecting.
                continue
            merged.append(pending)
            pending = ""
        if pending:
            # Unterminated quotation at the end of the paragraph: keep the
            # text instead of silently dropping it.
            merged.append(pending)
        sentence_after.append(merged)
    return sentence_after

def toDataFrame(list3):
    """Build a sentence table from nested sentence lists and print it.

    Args:
        list3: List of paragraphs, each paragraph being a list of sentence
            strings.

    Returns:
        A ``pandas.DataFrame`` with one row per sentence and the columns
        ``content`` (sentence text), ``paragraph`` (0-based paragraph
        number) and ``sentence`` (1-based sentence number inside the
        paragraph).  Every sentence text is also printed on its own line.
    """
    # Collect all rows first and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for para_num, sentences in enumerate(list3):
        for sentence_num, content in enumerate(sentences, start=1):
            records.append(
                {"content": content, "paragraph": para_num, "sentence": sentence_num}
            )
    df = pd.DataFrame(records, columns=["content", "paragraph", "sentence"])
    for content in df['content'].tolist():
        print(content)
    return df

def main():
    """Interactive entry point.

    Asks for the text file location, prints the sentences produced by
    ``segments`` followed by ``connect``, then asks whether the result
    should also be shown through ``toDataFrame``.
    """
    location = input("请输入文件地址:")
    sentences = connect(segments(location))
    print(sentences)

    answer = input("以DataFrame形式输出数据(Y,N):")
    if answer == 'Y':
        toDataFrame(sentences)
        return
    if answer == 'N':
        print("Thanks!!!!")
        return
    # Any other answer is reported as invalid input.
    print("程序结束!请检查的你的输入!")


# Run the interactive flow only when executed as a script.
if __name__ == '__main__':
    main()

有任何疑问,学习交流请直接联系我的邮箱:
d_zhao_work@163.com

Original: https://blog.csdn.net/weixin_43495948/article/details/114522172
Author: ccgkk
Title: Python实现对中文文本分句

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/531604/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球