首先,我们读取一篇文本文档,去除stopwords、只保留中文字符后进行分词。以下代码包括两个函数:
建议在看此部分代码之前,先看看数据集的层次结构,方便代码理解
import re,os,jieba
import numpy as np
import pandas as pd
import jieba.analyse
def extract_words_one_file(filepath, all=True,
                           stopwords_path='data/stopwords/hit_stopwords.txt'):
    """Tokenize one text file into Chinese words with stop words removed.

    Each line of the file is treated as one sentence: every character
    outside the range U+4E00..U+9FA5 is deleted, the remainder is segmented
    with ``jieba.cut``, and tokens found in the stop-word list are dropped.
    Lines that yield no tokens are skipped.

    Args:
        filepath: Path of the UTF-8 text file to process. Undecodable bytes
            are ignored.
        all: If true, return one flat list of words for the whole file;
            otherwise return a list of per-line word lists. (The name
            shadows the builtin but is kept for caller compatibility.)
        stopwords_path: UTF-8 file with one stop word per line.

    Returns:
        ``list[str]`` when ``all`` is true, else ``list[list[str]]``.
    """
    # The character class must be bracketed and negated: without the
    # brackets the pattern only matches a literal three-character sequence
    # at the start of the string, so nothing is ever stripped.
    non_chinese = re.compile(r'[^\u4e00-\u9fa5]')

    # Load the stop words once per file (not once per line) into a set so
    # each membership test is O(1); ``with`` closes the handle.
    with open(stopwords_path, 'r', encoding='utf-8-sig') as fp:
        stopwords = {line.strip() for line in fp}

    chinesewords_sentence = []
    with open(filepath, 'r', errors='ignore', encoding='utf-8') as fp:
        for line in fp:
            cleaned = non_chinese.sub('', line)
            tokens = [
                word for word in jieba.cut(cleaned)
                if word.strip() and word not in stopwords
            ]
            if tokens:
                chinesewords_sentence.append(tokens)

    if all:
        return [word for sentence in chinesewords_sentence for word in sentence]
    return chinesewords_sentence
def extract_words_folder(path, all=True):
    """Tokenize every document under a ``<path>/<category>/<file>`` tree.

    Args:
        path: Root directory whose immediate sub-directories are categories,
            each containing the text files of that category.
        all: If true, build one row per document holding its flat word list,
            its category directory name and its file name. Otherwise collect
            only the per-sentence word lists of each document.

    Returns:
        A ``pandas.DataFrame`` with columns ``Words``, ``Category`` and
        ``File`` when ``all`` is true; otherwise a list with one entry per
        document, each entry being that document's list of sentences.
    """
    features = []
    # Sorted so the output order does not depend on the file system.
    for category in sorted(os.listdir(path)):
        category_dir = os.path.join(path, category)
        if not os.path.isdir(category_dir):
            # Stray files next to the category folders (e.g. .DS_Store)
            # would otherwise make os.listdir raise NotADirectoryError.
            continue
        for filename in sorted(os.listdir(category_dir)):
            file_path = os.path.join(category_dir, filename)
            if all:
                words = extract_words_one_file(file_path, all=True)
                features.append([words, category, filename])
            else:
                features.append(extract_words_one_file(file_path, all=False))

    if all:
        return pd.DataFrame(features, columns=['Words', 'Category', 'File'])
    return features
# Pass 1: one row per training article (flat word list, category, file name),
# written out as CSV without the index column.
article_features = extract_words_folder(all=True, path='data/fudan-utf8/train')
article_features.to_csv(
    "article_features_train_raw.csv",
    index=False,
    encoding='utf_8',
)

# Pass 2: the same corpus kept at sentence granularity. The nested list is
# stored as its Python text representation in a plain text file.
sent_features = extract_words_folder(all=False, path='data/fudan-utf8/train')
with open("word_sentence_train.txt", mode="w", encoding='utf-8') as f:
    serialized = str(sent_features)
    f.write(serialized)
为了保证之后数据的一致性,我没有先把所有数据一并保存、之后再划分训练集和测试集,而是事先手动划分:自己创建了一个test文件夹,分别提取训练集和测试集的数据,再分别存入两个csv文件。划分时我也发现,不同类别的样本数量差别很大,有的上千,有的还不过百。因此,我只选取了其中样本最多的9个类别,去除了剩余的11个类别,因为样本过少的类别与其他类别数量悬殊,很难让模型作出无偏见的判定。
import pandas as pd
import numpy as np
# NOTE(review): the extraction step writes article_features_train_raw.csv to
# the working directory, while these reads expect the files under data/ —
# confirm the raw CSVs were moved there.
train = pd.read_csv('data/article_features_train_raw.csv')
test = pd.read_csv('data/article_features_test_raw.csv')

# The nine categories kept for classification and their integer labels.
label2category = {0: 'C11-Space', 1: 'C19-Computer', 2: 'C3-Art', 3: 'C31-Environment', 4: 'C32-Agriculture',
                  5: 'C34-Economy', 6: 'C38-Politics', 7: 'C39-Sports', 8: 'C7-History'}
category2label = {category: label for label, category in label2category.items()}


def _select_and_label(frame):
    """Return the rows of ``frame`` in the kept categories, with a ``label`` column.

    The misspelled directory name ``C31-Enviornment`` is normalized first so
    that those rows survive the filter. Works on a copy, so the input frame
    is not modified.
    """
    frame = frame.copy()
    frame['Category'] = frame['Category'].replace('C31-Enviornment', 'C31-Environment')
    kept = frame[frame['Category'].isin(list(category2label))].reset_index(drop=True)
    kept['label'] = kept['Category'].map(category2label)
    return kept


# Apply the identical clean-up to both splits; previously only the training
# split had the category spelling corrected.
train = _select_and_label(train)
test = _select_and_label(test)

train.to_csv("article_features_train.csv", encoding='utf_8_sig', index=False)
test.to_csv("article_features_test.csv", encoding='utf_8_sig', index=False)
分词提取完毕后,我们需要用一定的方式表示这些分词,将这些计算机无法处理的非结构化信息转化为可计算的结构化信息。one-hot方法是其中之一,它的原理很好理解,首先生成一个初始值为零的长度为所有词的列表。若文章或句子包含某词,则该词对应位置为1,否则为0。但是其在表示大规模文本的时候往往出现过于稀疏的现象,也无法表示词语与词语间的关系,所以只是在比较简单的实践中使用。
本文我们所采取的文本表示方法是词嵌入中的Word2Vec。通俗地说,就是通过训练将每个分词用多维向量表示。它有2种训练模式:通过上下文来预测当前词(CBOW),以及通过当前词来预测上下文(Skip-gram)。
gensim版本为4.1.2
import gensim
import ast

# NOTE(review): the extraction step above saves its sentences as
# "word_sentence_train.txt"; confirm that "word_sentence.txt" is the intended
# input here.
# The file was written as UTF-8, so read it back as UTF-8 instead of relying
# on the platform default encoding.
with open("word_sentence.txt", "r", encoding='utf-8') as f:
    word_sentence = f.read()

# The file holds the text form of a nested list of strings. literal_eval
# parses Python literals only, so nothing in the file can execute as code
# (unlike eval).
sent_feature = ast.literal_eval(word_sentence)

# Flatten documents into sentences, keeping only sentences with more than
# three words.
sent_words = [sentence for document in sent_feature for sentence in document if len(sentence) > 3]
在第二部分中,我们已经将分词按句子形式保存到了文本文件中,现在,我们便利用它直接进行训练获得模型。
import numpy as np

# --- Model 1: skip-gram vectors trained from scratch on the corpus ---------
# gensim 4.x names these parameters vector_size / epochs (3.x: size / iter).
model = gensim.models.Word2Vec(sent_words, sg=1, vector_size=100, window=3, epochs=5,
                               min_count=3, negative=3, sample=0.001, hs=1)
model.wv.save_word2vec_format('./word2vec_model.txt', binary=False)

# --- Model 2: corpus vocabulary initialized from pretrained vectors --------
w2v_model = gensim.models.Word2Vec(vector_size=300, window=3, sg=1, min_count=3)
w2v_model.build_vocab(sent_words)

third_model = gensim.models.KeyedVectors.load_word2vec_format('data/fudan-utf8/sgns.merge.word', binary=False)
# In gensim 4.x the vocabulary is the plain dict ``key_to_index``; the old
# ``.vocab`` attribute is gone. The pretrained words are fed in as one
# artificial sentence.
# NOTE(review): each pretrained word occurs once in that sentence, so with
# min_count=3 this update presumably adds no new words — confirm intent.
w2v_model.build_vocab([list(third_model.key_to_index)], update=True)

# NOTE(review): gensim 4.x appears to allocate a single-element vectors_lockf
# by default, while intersect_word2vec_format writes one lock factor per
# word; size the array per word first. Verify against the installed version.
w2v_model.wv.vectors_lockf = np.ones(len(w2v_model.wv), dtype=np.float32)
# In gensim 4.x this method lives on the KeyedVectors (model.wv).
w2v_model.wv.intersect_word2vec_format('data/fudan-utf8/sgns.merge.word', binary=False, lockf=1.0)

# corpus_count was overwritten by the one-sentence vocabulary update above,
# so pass the real number of training sentences explicitly.
w2v_model.train(sent_words, total_examples=len(sent_words), epochs=5)
print("Model training finished.")
w2v_model.wv.save_word2vec_format('./word2vec_ensemble.txt', binary=False)
print("Model saved.")
Original: https://blog.csdn.net/qq_42801194/article/details/121535761
Author: Vincy_King
Title: 中文分词(上)——获取和Word2Vec模型构建
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/544639/
转载文章受原作者版权保护。转载请注明原作者出处!