import glob
import json
import os
from itertools import chain

import numpy as np
from matplotlib import pyplot as plt
from matplotlib.pyplot import MultipleLocator

from SplitIntoSentences import split_into_sentences


# 1. Collect the author id of every paragraph in the training set into one
#    file, count the total number of paragraphs, and check whether any labels
#    are missing from the training set.
def task_one():
    input_train = '../pan21/train'
    output_analysis = 'data_analysis'
    corpora = glob.glob(input_train + '/*.txt')
    index = 0
    para_num_dict = {}
    para_dict = {}
    paras = {}
    for document_path in corpora:
        # Read each document and key it by its id
        with open(document_path, 'r', encoding="utf-8") as file:
            document = file.read()
        share_id = os.path.basename(document_path)[8:-4]
        paragraphs = document.split('\n')
        para_num_dict[share_id] = len(paragraphs)
        paras[share_id] = paragraphs
    for i in range(1, 11201):
        for paragraph in paras[str(i)]:
            para_dict[index] = paragraph
            index += 1

    lost_label_document_id = []
    truth_label = read_ground_truth_files(input_train)
    author_list = []
    for num in range(1, 11201):
        author_ids = truth_label[str(num)]["structure"]
        author_local_change = truth_label[str(num)]['changes']
        # A document with n paragraphs should carry n - 1 change labels
        if len(author_local_change) + 1 != para_num_dict[str(num)]:
            lost_label_document_id.append(num)
            author_local_change.append(0)
        author_list.append(author_ids.pop(0))
        for author_change in author_local_change:
            if author_change == 0:
                author_list.append(author_list[-1])
            else:
                author_list.append(author_ids.pop(0))
    result = "number of paragraphs: {}\nnumber of author labels: {}\ndocument ids with lost labels: {}". \
        format(len(para_dict.keys()), len(author_list), lost_label_document_id)
    print(result)
    with open(os.path.join(output_analysis, "new_train.txt"), 'w', encoding='utf-8') as f:
        for k, v in para_dict.items():
            f.write('%d\t%s\n' % (author_list[k], v))


# 2. Count, for each document in the training set, the number of paragraphs,
#    the number of sentences per paragraph and the length of every sentence,
#    plus the corresponding mean, median and mode.
def task_two():
    input_train = '../pan21/validation'
    # output_train = 'data_analysis'
    corpora = glob.glob(input_train + '/*.txt')
    para_num_dict = {}
    for document_path in corpora:
        # Read each document and key it by its id
        with open(document_path, 'r', encoding="utf-8") as file:
            document = file.read()
        share_id = os.path.basename(document_path)[8:-4]
        paragraphs = document.split('\n')
        para_num_dict[int(share_id)] = len(paragraphs)
        # for paragraph in paragraphs:
        #     sentences = split_into_sentences(paragraph)
        #     sen_num_list.append(len(sentences))
        #     for sentence in sentences:
        #         sentence = sentence.split(' ')
        #         sen_len_list.append(len(sentence))
    # para_info = get_info(para_num_list)
    # sen_num_info = get_info(sen_num_list)
    # sen_len_info = get_info(sen_len_list)
    # result = 'paragraphs information:\n{}\nnumber of sentences for each paragraph:\n{}\nsentence length:\n{}'.\
    #     format(para_info, sen_num_info, sen_len_info)
    # with open(os.path.join(output_train, "analysis2.txt"), 'w') as f:
    #     f.write(result)
    print(sum(para_num_dict.values()))
    # Plot
    # author_changes = sorted(para_num_dict.items(), key=lambda e: e[0])
    # x = []
    # y = []
    # for i in range(max(para_num_dict.keys())):
    #     x.append(author_changes[i][0])
    #     y.append(author_changes[i][1])
    # plt.title("The number of paragraphs per document")
    # plt.xlabel("Document id")
    # plt.ylabel("The number of paragraphs")
    # plt.plot(x, y, 'ro')
    # x_major_locator = MultipleLocator(2000)
    # ax = plt.gca()
    # ax.xaxis.set_major_locator(x_major_locator)
    # plt.show()
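# A quick illustration (a hedged example, not one of the original tasks) of
# the statistics helpers get_info / get_average / get_most / get_median
# defined at the bottom of this file:
def demo_get_info():
    # For [1, 2, 2, 9]: max 9, min 1, avg 3.5, mode [2.0, 2] (mean of the
    # modes together with their frequency), median 2.0.
    print(get_info([1, 2, 2, 9]))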
# 3. Count how many authors each document has, and plot the distribution of
#    author counts and of style changes over all documents.
def task_three():
    input_train = '../pan21/train'
    output_train = 'data_analysis'
    truth_label = read_ground_truth_files(input_train)
    author_num_dict = {}
    author_changes_dict = {}
    for num in range(1, 11201):
        author_changes = truth_label[str(num)]['changes']
        author_num = truth_label[str(num)]["authors"]
        change_num = 0
        for change in author_changes:
            if change == 1:
                change_num += 1
        if change_num not in author_changes_dict.keys():
            author_changes_dict[change_num] = 1
        else:
            author_changes_dict[change_num] += 1
        if author_num not in author_num_dict.keys():
            author_num_dict[author_num] = 1
        else:
            author_num_dict[author_num] += 1
    result = 'author number distribution: \n{}\nauthor changes distribution: \n{}'.\
        format(author_num_dict, author_changes_dict)
    with open(os.path.join(output_train, "analysis3.txt"), 'w') as f3:
        f3.write(result)
    # Plot
    author_changes = sorted(author_changes_dict.items(), key=lambda e: e[0])
    x = []
    y = []
    for i in range(len(author_changes)):
        x.append(author_changes[i][0])
        y.append(author_changes[i][1] / 11200)
    plt.title("Author changes distribution")
    plt.xlabel("Number of style changes")
    plt.ylabel("Percentage of documents")
    plt.plot(x, y, marker='o', color='r')
    x_major_locator = MultipleLocator(1)
    ax = plt.gca()
    ax.xaxis.set_major_locator(x_major_locator)
    plt.show()


# 4. Check whether any author ids appear in both the training set and the
#    validation set.
def task_four():
    input_train = '../pan21/train'
    input_validation = '../pan21/validation'
    output_analysis = 'data_analysis'
    truth_label = read_ground_truth_files(input_train)
    validation_label = read_ground_truth_files(input_validation)
    author_list_val = []
    author_list = []
    for num in range(1, 11201):
        author_ids = truth_label[str(num)]["structure"]
        for author_id in author_ids:
            if author_id not in author_list:
                author_list.append(author_id)
    for num in range(1, 2401):
        author_ids = validation_label[str(num)]["structure"]
        for author_id in author_ids:
            if author_id not in author_list_val:
                author_list_val.append(author_id)
    print(author_list_val)
    result = "common author ids: {}\nnumber of authors in train: {}\nnumber of authors in validation: {}". \
        format(sorted(set(author_list) & set(author_list_val)), len(author_list), len(author_list_val))
    # with open(os.path.join(output_analysis, "analysis4.txt"), 'w') as f:
    #     f.write(result)
    print(result)
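# The tasks above assume the PAN21 layout: each document problem-<id>.txt sits
# next to a truth file truth-problem-<id>.json (hence the [8:-4] and [14:-5]
# filename slices). A plausible truth file, inferred only from the fields this
# script actually reads; the real files may carry additional keys:
EXAMPLE_TRUTH = {
    "authors": 2,                     # number of distinct authors
    "structure": ["A1", "A2", "A1"],  # author of each single-author segment
    "changes": [1, 1],                # style change between paragraphs i, i+1
    "paragraph-authors": [1, 2, 1],   # author index of each paragraph
}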
# 5. Count how many authors the training set contains and how many paragraphs
#    each author has.
def task_five():
    # output_train = 'data_analysis'
    author_dict = {}
    newtrain = open('data_analysis/new_train.txt', 'r', encoding='utf-8')
    for line in newtrain.readlines():
        label, _ = line.strip().split('\t', 1)
        label = int(label)
        if label not in author_dict.keys():
            author_dict[label] = 1
        else:
            author_dict[label] += 1
    newtrain.close()
    # result = "number of authors:{}\nnumber of author's paragraphs:{}\nspecific author paragraphs:{}".\
    #     format(len(set(author_dict.keys())), get_info(list(author_dict.values())), sorted(author_dict.items()))
    # with open(os.path.join(output_train, "analysis1.txt"), 'w') as f:
    #     f.write(result)
    # Plot
    author_changes = sorted(author_dict.items(), key=lambda e: e[1])
    x = []
    y = []
    for i in range(len(author_dict.keys())):
        x.append(i + 1)
        y.append(author_changes[i][1])
    plt.title("The distribution of the number of paragraphs per author")
    plt.xlabel("Author id")
    plt.ylabel("The number of paragraphs")
    plt.plot(x, y, marker='o', color='r')
    x_major_locator = MultipleLocator(2500)
    ax = plt.gca()
    ax.xaxis.set_major_locator(x_major_locator)
    plt.show()


# 6. Pull some authors' paragraphs out of the full training set to serve as a
#    validation set. The validation set should hold 11588 = 77252 * 0.15
#    paragraphs; here we take 14552 instead: 12617 authors with at least two
#    paragraphs plus 1935 authors with only one. The full training set has
#    77252 paragraphs and 17051 authors in total. Each of these 14552 authors
#    must keep fewer paragraphs in the validation set than they have overall,
#    so we simply take one paragraph per author.
def task_six():
    one_author_id = []
    author_dict = {}
    newtrain = open('data_analysis/new_train.txt', 'r', encoding='utf-8')
    for line in newtrain.readlines():
        la, _ = line.strip().split('\t', 1)
        la = int(la)
        if la not in author_dict.keys():
            author_dict[la] = 1
        else:
            author_dict[la] += 1
    newtrain.close()
    for k, v in author_dict.items():
        if v == 1:
            one_author_id.append(k)
    print(one_author_id)
    print(len(one_author_id))
    split_train = open('data_analysis/split_train.txt', 'a', encoding='utf-8')
    split_validation = open('data_analysis/split_validation.txt', 'a', encoding='utf-8')
    author_add = []
    count_one_author_num = 0
    split_val_num = 0
    split_train_num = 0
    with open('data_analysis/new_train.txt', 'r', encoding='utf-8') as fb:
        for line in fb.readlines():
            line_t = line.strip().split('\t', 1)
            if int(line_t[0]) not in author_add and int(line_t[0]) not in one_author_id:
                # The first paragraph of a multi-paragraph author goes to validation
                split_validation.write(line)
                author_add.append(int(line_t[0]))
                split_val_num += 1
            elif int(line_t[0]) in one_author_id and count_one_author_num < 1935:
                # Single-paragraph authors go to both splits
                count_one_author_num += 1
                split_validation.write(line)
                split_val_num += 1
                split_train_num += 1
                split_train.write(line)
            else:
                split_train_num += 1
                split_train.write(line)
    print('number of split train: %d\nnumber of split validation: %d' % (split_train_num, split_val_num))
    split_train.close()
    split_validation.close()
    split_val = open('data_analysis/split_validation.txt', 'r', encoding='utf-8')
    val_list = []
    val_ = []
    for row in split_val.readlines():
        author_id, _ = row.strip().split('\t', 1)
        if author_id not in val_list:
            val_list.append(author_id)
        else:
            val_.append(author_id)
    split_val.close()
    print(len(val_))
    print(val_)


# 7. To track down the cause of some NaN values, cut the training set into
#    chunks of different sizes for testing.
def task_seven():
    split_train = open('data_analysis/split_train.txt', 'r', encoding='utf-8')
    split_small = open('data_analysis/split_small.txt', 'a', encoding='utf-8')
    train_list = split_train.readlines()
    for i in range(100):
        split_small.write(train_list.pop(0))
    split_train.close()
    split_small.close()


# 8. Check the number of authors in the validation split.
def task_eight():
    split_val = open('data_analysis/split_validation.txt', 'r', encoding='utf-8')
    val_list = []
    val_ = []
    for row in split_val.readlines():
        author_id, _ = row.strip().split('\t', 1)
        if author_id not in val_list:
            val_list.append(author_id)
        else:
            val_.append(author_id)
    split_val.close()
    print(len(val_))
    print(val_)
    print(len(set(val_list)))
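# A hedged sanity check (not one of the original tasks): recount the split
# figures quoted above task_six straight from new_train.txt, assuming the
# "author_id<TAB>paragraph" format that task_one writes. Expected from the
# notes: 12617 authors with two or more paragraphs, 1935 with exactly one,
# 77252 paragraphs and 17051 authors in total.
def recount_split_sizes():
    from collections import Counter
    counts = Counter()
    with open('data_analysis/new_train.txt', 'r', encoding='utf-8') as f:
        for line in f:
            author_id, _ = line.rstrip('\n').split('\t', 1)
            counts[int(author_id)] += 1
    multi = sum(1 for c in counts.values() if c >= 2)
    single = sum(1 for c in counts.values() if c == 1)
    print(multi, single, sum(counts.values()), len(counts))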
# 9. Attach classification indices to the author ids.
def task_nine():
    author_dict = {}
    newtrain = open('data_analysis/new_train.txt', 'r', encoding='utf-8')
    for line in newtrain.readlines():
        la, _ = line.strip().split('\t', 1)
        la = int(la)
        if la not in author_dict.keys():
            author_dict[la] = 1
        else:
            author_dict[la] += 1
    newtrain.close()
    print(get_info(list(author_dict.values())))


# 10. Count the length of every paragraph in the training set and how often
#     each length occurs, then plot the distribution.
def task_ten():
    # output_train = 'data_analysis'
    corpora = glob.glob('../pan21/train/*.txt')
    # corpora = glob.glob('../pan21/validation/*.txt')
    para_dict = {}
    for document_path in corpora:
        with open(document_path, 'r', encoding="utf-8") as file:
            document = file.read()
        paragraphs = document.split('\n')
        for i in range(len(paragraphs)):
            para_len = len(paragraphs[i])
            if para_len > 4000:
                continue
            if para_len not in para_dict.keys():
                para_dict[para_len] = 1
            else:
                para_dict[para_len] += 1
    # Plot
    para_dict = sorted(para_dict.items(), key=lambda _: _[0])
    x = []
    y = []
    for i in range(len(para_dict)):
        x.append(para_dict[i][0])
        y.append(para_dict[i][1])
    plt.title("Distribution of paragraph length in the training set")
    plt.xlabel("Length of paragraph")
    plt.ylabel("Number")
    plt.plot(x, y, 'or')
    x_major_locator = MultipleLocator(200)
    ax = plt.gca()
    ax.xaxis.set_major_locator(x_major_locator)
    plt.show()
    print(get_info(y))


# 11. Check whether the number of paragraphs in the validation set matches
#     the number of labels.
def eleven():
    corpora = glob.glob('../pan21/validation/*.txt')
    train_labels = read_ground_truth_files('../pan21/validation')
    para_len_list = []
    change_label_list = []
    for document_path in corpora:
        with open(document_path, 'r', encoding="utf-8") as file:
            document = file.read()
        paragraphs = document.split('\n')
        para_len = len(paragraphs)
        share_id = os.path.basename(document_path)[8:-4]
        change_labels = train_labels[share_id]['changes']
        change_label_list.append(change_labels)
        para_len_list.append(para_len)
    print(len(change_label_list))
    print(len(list(chain.from_iterable(change_label_list))))
    print(len(para_len_list))
    print(sum(para_len_list))


# 12. Count the numbers of 1s and 0s among the changes labels.
def twelve():
    train_labels = read_ground_truth_files('../pan21/validation')
    label_dict = {'1': 0, '0': 0, 'length': 0}
    task3_label_dict = {'1': 0, '0': 0, 'length': 0}
    for i in range(1, 2401):
        labels = train_labels[str(i)]['changes']
        task3labels = train_labels[str(i)]['paragraph-authors']
        task3_labels = separate_para_label(task3labels)
        label_dict['1'] += sum(labels)
        label_dict['0'] += (len(labels) - sum(labels))
        label_dict['length'] += len(labels)
        task3_label_dict['1'] += sum(task3_labels)
        task3_label_dict['0'] += (len(task3_labels) - sum(task3_labels))
        task3_label_dict['length'] += len(task3_labels)
    print(label_dict)
    print(task3_label_dict)
    # Plot
    # para_dict = sorted(author_num_dict.items(), key=lambda _: _[0])
    x = [' changes label (14,095)', ' task3-binary label (60,365)']
    y1 = [6550, 27727]
    y2 = [7545, 32638]
    # for i in range(2):
    #     y1.append(label_dict[str(i)])
    #     y2.append(task3_label_dict[str(i)])
    plt.title('The number of 1 and 0 in validation set')
    plt.xlabel("Label")
    plt.ylabel("Number")
    # plt.yticks([500, 1000, 1500, 2000, 2500, 2800])
    width = 0.3  # bar width
    index = np.arange(2)
    plt.bar(index, y1, width, color='steelblue', tick_label=x)
    plt.bar(index + width, y2, width, color='red')
    plt.legend(['0', '1'])
    for a, b in zip(index, y1):
        # Print the value on top of each bar
        plt.text(a, b, '%d' % b, ha='center', va='bottom', fontsize=7)
    for a, b in zip(index + width, y2):
        plt.text(a, b, '%d' % b, ha='center', va='bottom', fontsize=7)
    # x_major_locator = MultipleLocator(1)
    # ax = plt.gca()
    # ax.xaxis.set_major_locator(x_major_locator)
    plt.show()
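# A small sanity check for separate_para_label (defined further down): every
# pair of paragraphs gets one binary label, 0 for same author and 1 for
# different authors, so n paragraphs yield n * (n - 1) / 2 labels, which is
# why the task3-binary total (60,365) far exceeds the changes total (14,095).
def check_separate_para_label():
    print(separate_para_label([1, 1, 2]))  # expected output: [0, 1, 1]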
# 13. Plot the author-count distributions of the training and validation sets.
def thirteen():
    input_train = '../pan21/validation'
    truth_label = read_ground_truth_files(input_train)
    author_num_dict = {}
    for num in range(1, 2401):
        author_num = truth_label[str(num)]["authors"]
        if author_num not in author_num_dict.keys():
            author_num_dict[author_num] = 1
        else:
            author_num_dict[author_num] += 1
    result = 'author number distribution: \n{}'.\
        format(author_num_dict)
    print(result)
    # Plot
    para_dict = sorted(author_num_dict.items(), key=lambda _: _[0])
    x = []
    y = []
    for i in range(len(para_dict)):
        x.append(para_dict[i][0])
        y.append(para_dict[i][1])
    plt.bar(x, y)
    plt.title('Distribution of author number of each document in train set')
    plt.xlabel("Author number of each document")
    plt.ylabel("Number")
    # plt.yticks([500, 1000, 1500, 2000, 2500, 2800])
    x_major_locator = MultipleLocator(1)
    ax = plt.gca()
    ax.xaxis.set_major_locator(x_major_locator)
    plt.show()


# Split the task3 (paragraph-authors) labels into task3-binary labels: one
# label per paragraph pair, 1 if the two paragraphs have different authors.
def separate_para_label(paragraphs_label):
    separate_label = []
    for i in range(len(paragraphs_label)):
        if i == 0:
            continue
        for a in range(i):
            if paragraphs_label[a] != paragraphs_label[i]:
                separate_label.append(1)
            else:
                separate_label.append(0)
    return separate_label


# Read the ground-truth labels, key them by document id and store them in a
# dictionary.
def read_ground_truth_files(truth_folder):
    truth = {}
    for truth_file in glob.glob(os.path.join(truth_folder, 'truth-problem-*.json')):
        with open(truth_file, 'r', encoding='utf-8') as fh:
            curr_truth = json.load(fh)
        truth[os.path.basename(truth_file)[14:-5]] = curr_truth
    return truth


# Median
def get_median(data):
    data = sorted(data)
    size = len(data)
    median = 0
    if size % 2 == 0:  # even-length list
        median = (data[size // 2] + data[size // 2 - 1]) / 2
    if size % 2 == 1:  # odd-length list
        median = data[(size - 1) // 2]
    return median


# Mode (returns the average of all modes together with their frequency)
def get_most(list_):
    most = []
    item_num = dict((item, list_.count(item)) for item in list_)
    for k, v in item_num.items():
        if v == max(item_num.values()):
            most.append(k)
    mos_num = [sum(most) / len(most), max(item_num.values())]
    return mos_num


# Mean
def get_average(list_):
    sum_ = 0
    for item in list_:
        sum_ += item
    return sum_ / len(list_)


# Aggregate the statistics above into a printable summary
def get_info(x_list):
    max_len = max(x_list)
    min_len = min(x_list)
    avg_len = get_average(x_list)
    mos_num = get_most(x_list)
    med_len = get_median(x_list)
    print_format = '{{\n max: {}\n min: {}\n avg: {}\n most: {}\n median: {}\n}}'. \
        format(max_len, min_len, avg_len, mos_num, med_len)
    return print_format


if __name__ == '__main__':
    # task_one()
    # task_two()
    # task_three()
    # task_four()
    # task_five()
    # task_six()
    # task_seven()
    # task_eight()
    # task_nine()
    task_ten()
    # eleven()
    # twelve()
    # thirteen()
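The script also imports split_into_sentences from a local SplitIntoSentences module that the post does not include. A minimal stand-in, assuming only the name and signature used above (the original is presumably a more careful rule-based splitter), so the file at least runs:

# SplitIntoSentences.py, a hypothetical stand-in for the module imported above
import re

def split_into_sentences(text):
    # Naive split after ., ! or ? followed by whitespace; adequate for the
    # sentence counting in task_two, not a faithful replacement.
    return [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]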
Some Python code for analyzing text length that a senior labmate gave me.
Original: https://blog.csdn.net/weixin_52634719/article/details/120178545
Author: 爱雨天
Title: Python分析文本长度和句子个数的代码参考 (Reference code for analyzing text length and sentence count in Python)