第1关 LDA算法
import jieba
import jieba.analyse as analyse
import gensim
from gensim import corpora, models, similarities
def get_stopword_list():
    """Load the stop-word list from ./stopword.txt (one word per line).

    Returns:
        list[str]: stop words with newline characters removed.
    """
    stop_word_path = './stopword.txt'
    # Use a context manager so the file handle is closed deterministically
    # (the original left the file open).
    with open(stop_word_path, encoding='utf-8') as f:
        return [sw.replace('\n', '') for sw in f.readlines()]
# --- Step 1: LDA topic extraction --------------------------------------
stop_word = get_stopword_list()

# Read one document, tokenise it with jieba, and drop stop words.
raw_text = input()
tokens = [w for w in jieba.lcut(raw_text) if w not in stop_word]
sentences = [tokens]

# Build the gensim dictionary and bag-of-words corpus from the single document.
dictionary = corpora.Dictionary(sentences)
corpus = [dictionary.doc2bow(sentence) for sentence in sentences]

result = ""
# Train a one-topic LDA model and print only the top word of that topic.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=1)
for result in lda.print_topics(num_words=1):
    # print_topics yields (topic_id, "weight*word + ..."); take the word part.
    print(result[1].split('*')[1], end="")
注:这一题的输出没有很符合我的预期,所以我干脆直接改了它的print输出,用自己更喜欢的方式输出
第1关:去除停用词
def get_stopword_list():
    """Read ./stopword.txt and return its stop words as a list.

    Returns:
        list[str]: one entry per line, newline characters stripped.
    """
    stop_word_path = './stopword.txt'
    # Context manager closes the file; the original leaked the handle.
    with open(stop_word_path, encoding='utf-8') as f:
        return [sw.replace('\n', '') for sw in f.readlines()]
if __name__ == '__main__':
    # Read one line and drop every character found in the stop-word list.
    # NOTE(review): iterating a str yields single characters, so only
    # single-character stop words can ever match — presumably intended here.
    sentence = input()
    stopwords = get_stopword_list()
    kept = [ch for ch in sentence if ch not in stopwords]
    result = "".join(kept)
    print(result, end="")
TF/IDF算法
import math
import jieba
import jieba.posseg as psg
from gensim import corpora, models
from jieba import analyse
import functools
from collections import defaultdict
class TfIdf(object):
    """Rank the words of a document by TF-IDF and print the top keywords.

    Args:
        idf_dic: mapping word -> precomputed IDF value.
        default_idf: IDF used for words missing from idf_dic.
        word_list: tokenised document (list of words, duplicates allowed).
        keyword_num: how many top-scoring keywords to print.
    """

    def __init__(self, idf_dic, default_idf, word_list, keyword_num):
        self.word_list = word_list
        self.idf_dic, self.default_idf = idf_dic, default_idf
        self.tf_dic = self.get_tf_dic()
        self.keyword_num = keyword_num

    def get_tf_dic(self):
        """Return {word: term frequency} for the document in word_list."""
        doc_frequency = defaultdict(int)
        for word in self.word_list:
            doc_frequency[word] += 1
        # Hoisted out of the loop: the original recomputed
        # sum(doc_frequency.values()) once per distinct word (O(n^2)).
        total = sum(doc_frequency.values())
        return {word: count / total for word, count in doc_frequency.items()}

    def get_tfidf(self):
        """Print the keyword_num highest-scoring words as 'word/ word/ ...'."""
        tfidf_dic = {}
        for word in self.word_list:
            idf = self.idf_dic.get(word, self.default_idf)
            tf = self.tf_dic.get(word, 0)
            tfidf_dic[word] = tf * idf
        # (A dangling no-op `tfidf_dic.items()` statement was removed here.)
        # Sort by score, ties broken by the module-level cmp comparator.
        for k, v in sorted(tfidf_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)[:self.keyword_num]:
            print(k + "/ ", end='')
        print()
def cmp(e1, e2):
    """Comparator for (word, score) pairs, for use with functools.cmp_to_key.

    Orders primarily by score; on equal scores, falls back to comparing the
    two possible concatenations of the words so ordering is deterministic.

    Returns:
        int: positive / 0 / negative in the classic cmp convention.
    """
    # Pure-Python sign of (e1[1] - e2[1]); the original imported numpy inside
    # the comparator, paying the import-lookup cost on every comparison.
    res = (e1[1] > e2[1]) - (e1[1] < e2[1])
    if res != 0:
        return res
    a = e1[0] + e2[0]
    b = e2[0] + e1[0]
    if a > b:
        return 1
    if a == b:
        return 0
    return -1
注: 这里对字典的统计我引入了defaultdict(它是dict的子类,访问缺失的键时会自动生成默认值,省去了手动初始化键值对的步骤),算是额外引入了一个标准库工具
第1关 Jieba 在关键词提取中的应用
import jieba.analyse
import warnings

warnings.filterwarnings("ignore")

sentence = input()
result = ''
# Characters listed here are filtered out before keyword extraction
# (the membership test is per character, since iterating a str yields chars).
stopwords = "1000 , 。防控 审查 不 项目 支付 省住 销售 返还 佣"
filtered = [ch for ch in sentence if ch not in stopwords]
text = "".join(filtered)
# Appended verbatim so this token can surface among the extracted keywords.
text += "何靖"
for word in jieba.analyse.extract_tags(text, topK=3, withWeight=False, allowPOS=()):
    result += word + " "
print(result)
测试用例:
一、
针对集体宿舍人员如何科学防控的问题,中国疾控中心环境所所长施小明表示,要加强日常体温检测,对进入集体宿舍人员进行体温检测,发现体温异常人员要立即将其转移至临时隔离区域,并按相关规定进行处置。同时严控集体宿舍住宿人数超标问题,设置可开启窗户定时通风。
二、
以上信息提示,武汉疫情快速上升态势得到控制,湖北除武汉外,局部爆发的态势也得到控制,湖北以外省份疫情形势积极向好。下一步要从统筹推进疫情防控和经济社会发展出发,紧紧围绕社区防控和医疗救治两个重点,由全面防控向群专结合,精准防控转变。
三、
为推动项目尽快开工建设,省住建厅加大政策支持力度,允许施工图容缺受理审查,帮助业主提前开展施工图审查。各施工图审查机构充分发挥主观能动性,创造条件满足项目建设需求,确保施工图审查不接触、不间断、不延误。同时,加快支付施工图审查购买服务费,疫情期间,各级财政、住建部门购买并支付施工图审查服务费974万元,有效缓解审查机构资金成本压力。
五、
何靖所在的公司一直有一个”老带新”的推荐系统,想赚佣金的同事就参与。不过何靖表示,自己和同事平时工作量已经很饱和,且个人朋友圈子也不属于客户群,所以平时积极性不高。现在疫情来了,集团层面推出了”员工千元预定房源”的活动,员工每人交1000元预定一套房源,这套房源由个人出去销售,成交后,公司除了返还1000元预定金之外,还会支付和销售岗位同等比例的佣金,并额外补偿1000元;未成交,也会返还预定金。
注:这道题我是真没写出来,这个参数太难调了,使用了停用表将高频词删掉才通过的,应该有一套参数可以完美运行的;我这种方式只适合部分用例,通用性不强
第二关 TextRank算法
from jieba import analyse
import jieba.analyse

# Extract the top-3 TextRank keywords, restricted to place names, nouns,
# verbal nouns and verbs, then print them space-separated.
text = input()
keywords = jieba.analyse.textrank(text, topK=3, withWeight=False, allowPOS=("ns", "n", "vn", "v"))
result = ''.join(kw + " " for kw in keywords)
print(result)
第一关 情感分析基础
from snownlp import SnowNLP
def count_sno():
    """Count lines in ./step1/test.txt whose sentiment score exceeds 0.9.

    Returns:
        int: number of lines classified as strongly positive.
    """
    count = 0
    # Context manager fixes the original's leaked file handle; iterating the
    # file object yields the same lines readlines() produced.
    with open('./step1/test.txt', encoding='utf-8') as f:
        for line_words in f:
            # SnowNLP sentiment: closer to 1.0 means more positive.
            if SnowNLP(line_words).sentiments > 0.9:
                count += 1
    return count
第2关 帖子好评度分析
import re
from pyquery import PyQuery
from snownlp import SnowNLP
def evaluate(path):
    """Return the mean sentiment of the posts in an HTML file, scaled to 0-100.

    Args:
        path: path of the HTML file to analyse.

    Returns:
        int: average SnowNLP sentiment of the extracted posts times 100.
    """
    # Context manager fixes the original's leaked file handle.
    with open(path, 'r', encoding="utf-8") as html:
        code = html.read()
    pq = PyQuery(code)
    tag = pq('div.text')
    # Hoisted out of the loop: the original called taga.text() per character.
    link_text = pq('a').text()
    # Keep only the characters of div.text that do not appear in any <a> tag
    # (a crude way of stripping link text from the post bodies).
    words = "".join(ch for ch in tag.text() if ch not in link_text)
    # Posts are separated by ':' in the cleaned text; drop the leading header.
    txtlist = words.split(':')
    txtlist.pop(0)
    score, count = 0, 0
    for txt in txtlist:
        count += 1
        score += SnowNLP(txt).sentiments
    # NOTE(review): raises ZeroDivisionError if no post survives the split,
    # same as the original — confirm inputs always contain at least one post.
    return int(score * 100 / count)
注:这道题的比上道题的难点在于对html这个的解析工作,我这里采用的pyquery库进行解析,为了精准拆分还用了比较蠢的方法去实现,可能有更加有效的库去解决这个问题
Original: https://blog.csdn.net/Albert_weiku/article/details/127486022
Author: AlbertOS
Title: 头歌平台-人工智能技术应用-实践学习与答案2(补充实训部分)
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/651179/
转载文章受原作者版权保护。转载请注明原作者出处!