时间:2023-03-17来源:系统城装机大师作者:佚名
代码如下:
注意:运行前需要先安装依赖:pip install scikit-learn jieba
;
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
"""Extract TF-IDF keywords from every line of a text file.

Each line of the input file is treated as one document.  Words are produced
by jieba's part-of-speech tokenizer, weighted by the IDF learned from all
lines, and the top-scoring words of each line are printed and appended to an
output file (which the word-cloud script reads afterwards).

Third-party requirements: pip install scikit-learn jieba
"""
from collections import Counter
from re import split
from time import time

import jieba  # kept from the original import list; only jieba.posseg is used
from jieba.posseg import dt
from sklearn.feature_extraction.text import TfidfVectorizer

# Part-of-speech flags a word must carry to be kept as a keyword candidate.
FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())


def cut(text):
    """Yield the keyword candidates found in *text*.

    The text is split on runs of characters that are not ASCII letters,
    digits, or in the range U+4E00..U+9FA5; every piece is segmented with
    ``jieba.posseg.dt``.  A word is yielded only when it is longer than two
    characters and its part-of-speech flag is in ``FLAGS``.
    """
    for sentence in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip()):
        for w in dt.cut(sentence):
            if len(w.word) > 2 and w.flag in FLAGS:
                yield w.word


class TFIDF:
    """Keyword extractor backed by a word -> IDF lookup table."""

    def __init__(self, idf):
        """Store *idf*, a mapping from word to its IDF weight."""
        self.idf = idf

    @classmethod
    def train(cls, texts):
        """Build a TFIDF instance from an iterable of documents.

        ``lowercase=False`` keeps the vocabulary in the same case that
        ``extract`` later sees; with the vectorizer's default lowercasing,
        mixed-case English words would never match the learned table.
        """
        model = TfidfVectorizer(tokenizer=cut, lowercase=False)
        model.fit(texts)
        idf = {w: model.idf_[i] for w, i in model.vocabulary_.items()}
        return cls(idf)

    def get_idf(self, word):
        """Return the IDF of *word*.

        Unknown words get the largest IDF in the table (they are treated as
        maximally rare), or 0.0 when the table is empty.
        """
        try:
            return self.idf[word]
        except KeyError:
            # Only scan the table when the word is actually missing.
            return max(self.idf.values(), default=0.0)

    def extract(self, text, top_n=10):
        """Return up to *top_n* words of *text*, highest accumulated IDF first.

        Every occurrence of a word adds its IDF to that word's score, so the
        score is (term count) x IDF.
        """
        counter = Counter()
        for w in cut(text):
            counter[w] += self.get_idf(w)
        return [word for word, _ in counter.most_common(top_n)]


def main(input_path='./nlp-homework.txt', output_path='./resultciyun.txt'):
    """Train on the lines of *input_path* and append keywords to *output_path*.

    The keywords of each line are printed, then written to the output file
    separated by single spaces.  The output file is opened in append mode, so
    repeated runs accumulate.  The elapsed time in seconds is printed last.
    """
    t0 = time()
    with open(input_path, encoding='utf-8') as f:
        texts = f.read().strip().split('\n')

    tfidf = TFIDF.train(texts)

    # Open the output once instead of once per line.
    with open(output_path, 'a+', encoding='utf-8') as out:
        for text in texts:
            keywords = tfidf.extract(text)  # computed once, used twice
            print(keywords)
            out.write(''.join(f'{kw} ' for kw in keywords))

    print(time() - t0)


if __name__ == '__main__':
    main()
代码如下:
pip install wordcloud
还需要准备 SimSun.ttf 字体,并且将这个字体文件也放在和程序相同的目录下(代码中使用的文件名是 simsun.ttf,在区分大小写的系统上请保持文件名一致);
1 2 3 4 5 6 7 8 9 10 11 |
"""Render the keywords stored in resultciyun.txt as a word cloud image."""
import matplotlib.pyplot as plt
from wordcloud import WordCloud

filename = "resultciyun.txt"

# The keyword file is written as UTF-8; decode it explicitly so the script
# does not depend on the platform's default encoding.
with open(filename, encoding="utf-8") as f:
    resultciyun = f.read()

# font_path is a relative path: the font file is expected next to the script.
wordcloud = WordCloud(font_path="simsun.ttf").generate(resultciyun)

# In a Jupyter notebook, run `%pylab inline` first to show the figure inline.
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
最后的最后
由于本人水平所限,难免有错误以及不足之处,屏幕前的靓仔靓女们如有发现,恳请指出!
2024-07-16
如何使用 Go 依赖库管理器修复损坏的依赖项?2024-07-07
Java框架如何简化代码的调试过程2023-03-15
Go json反序列化“null“的问题解决由于数据库的类型为Data 类型,所以插入数据库的时候我先把前端传入的string类型的时间转为Time 再插入。 Go 提供了两种插入的方式,即time.Parse 和 time.ParseInLocation 。两种方式,他们的差异比较大。...
2023-03-09