Text Data Analysis

1. Corpora: jieba and NLTK

The NLTK library handles English text: since English marks word boundaries with spaces, word_tokenize() splits the text on spaces and punctuation and returns a list of tokens. The jieba library handles Chinese text: Chinese has no spaces between words, so only sentences and clauses can be separated by obvious delimiters such as punctuation, and splitting out individual words requires a dedicated segmenter such as jieba.
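As a quick illustration of the difference, the sketch below tokenizes an English sentence with NLTK and a Chinese one with jieba (the sample sentences are arbitrary, and word_tokenize needs the punkt tokenizer data downloaded once):

import nltk
import jieba
nltk.download('punkt')   # one-time download of the tokenizer model behind word_tokenize
print(nltk.word_tokenize('This is a wonderful book.'))   # ['This', 'is', 'a', 'wonderful', 'book', '.']
print(jieba.lcut('我喜欢读书'))   # e.g. ['我', '喜欢', '读书']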

2. Word segmentation

jieba provides three segmentation modes, demonstrated in the code below:

  • Accurate mode: tries to split the sentence as precisely as possible; suited to text analysis.
  • Full mode: scans out every word that could possibly be formed from the sentence; very fast, but cannot resolve ambiguity.
  • Search-engine mode: on top of accurate mode, further splits long words to improve recall; suited to search engines.
# jieba segmentation
import jieba

sentence2 = "传智专修学院推出颠覆式办学模式"
# Full mode
terms_list = jieba.cut(sentence2, cut_all=True)
print("Full Mode: " + ", ".join(terms_list))
# Accurate mode
terms_list = jieba.cut(sentence2, cut_all=False)
print("Accurate Mode: " + ", ".join(terms_list))
# cut_all defaults to False, so this is also accurate mode
terms_list = jieba.cut(sentence2)
print("Default Mode: " + ", ".join(terms_list))
# Search-engine mode
terms_list = jieba.cut_for_search(sentence2)
print("Search Engine Mode: " + ", ".join(terms_list))
[Output: segmentation results]
# Load a custom user dictionary
#jieba.load_userdict("user_dict.txt")
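The user dictionary referenced above is a plain-text file with one entry per line in the form word frequency part-of-speech, where the frequency and the tag are optional; a small sketch (the file name and the entries are only examples):

# user_dict.txt -- example entries, one per line:
#   传智专修学院 10 nt
#   颠覆式 5
jieba.load_userdict("user_dict.txt")
# Words listed in the dictionary are then kept intact when segmenting
print(", ".join(jieba.cut(sentence2)))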
# Part-of-speech tagging with NLTK (requires the averaged_perceptron_tagger data)
import nltk
sentence = 'This is a wonderful book'   # example English sentence to tag
words = nltk.word_tokenize(sentence)
nltk.pos_tag(words)
# Word-form normalization
# Import the Porter stemmer from the nltk.stem module
from nltk.stem.porter import PorterStemmer
# Extract the stem using the Porter algorithm
porter_stem = PorterStemmer()
porter_stem.stem('watched')

# Lancaster stemmer
from nltk.stem.lancaster import LancasterStemmer
lancaster_stem = LancasterStemmer()
lancaster_stem.stem('watched')

# Lemmatization with WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
wordnet_lem = WordNetLemmatizer()
wordnet_lem.lemmatize('watched', pos='v')
from nltk.stem.porter import PorterStemmer
# Stem every token of the example sentence with the Porter algorithm
porter_stem = PorterStemmer()
for word in words:
    print(porter_stem.stem(word))
[Output: POS tagging and stemming results]
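For comparison, stemming strips suffixes by rule and may return a non-word, while lemmatization maps the token to a dictionary form; a minimal sketch of the difference (the word 'studies' is just an illustrative input, expected outputs are noted in the comments):

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
print(PorterStemmer().stem('studies'))            # rule-based suffix stripping -> 'studi'
print(WordNetLemmatizer().lemmatize('studies'))   # dictionary lookup (default pos='n') -> 'study'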
# Remove stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Define an empty list for the remaining words
remain_words = []
# Keep a word only if it is not in the stop-word list
for word in words:
    if word not in stop_words:
        remain_words.append(word)
remain_words
[Output: words after stop-word removal]

3. English project: sentiment classification

text_one='This is a wonderful book'
text_two='I like reading this book very much.'
text_thr='This book reads well.'
text_fou='This book is not good.'
text_fiv='This is a very bad book.'
import nltk
# WordNetLemmatizer for lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
def pret_text(text):
    # Tokenize the text
    words = nltk.word_tokenize(text)
    # Lemmatize each token
    wordnet_lematizer = WordNetLemmatizer()
    words = [wordnet_lematizer.lemmatize(word) for word in words]
    # Remove stop words
    remain_word = [word for word in words if word not in stopwords.words('english')]
    # True marks that the word appears in the text
    return {word: True for word in remain_word}
# Build the training set: pair each text with a sentiment score (1 = positive, -1 = negative)
train_data = [[pret_text(text_one), 1],
              [pret_text(text_two), 1],
              [pret_text(text_thr), 1],
              [pret_text(text_fou), -1],
              [pret_text(text_fiv), -1]]
# Train the Naive Bayes model
demo_model = NaiveBayesClassifier.train(train_data)
print(demo_model)
# Test the model
test_text1='I like this movie very much'
demo_model.classify(pret_text(test_text1))
test_text2='The film is very bad'
demo_model.classify(pret_text(test_text2))
test_text3='The film is terrible'
demo_model.classify(pret_text(test_text3))
[Output: classification results]
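To see which words drive the decisions, NLTK's NaiveBayesClassifier exposes show_most_informative_features(), and prob_classify() returns a probability distribution instead of a hard label; a short sketch continuing with the demo_model trained above:

# Show the features (words) that contribute most to the classification
demo_model.show_most_informative_features(5)
# Probability estimate for a new review, in addition to the predicted label
prob_dist = demo_model.prob_classify(pret_text('I like this movie very much'))
print(prob_dist.max(), prob_dist.prob(prob_dist.max()))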

4. Analysis of Taobao shop product reviews

# Load the scraped product reviews (previously crawled and saved to an Excel file)
import pandas as pd
import nltk
from nltk.probability import FreqDist, ConditionalFreqDist
import jieba
file_data = pd.read_excel(r'C:\Users\hq\Desktop\商品评价.xlsx')
file_data
[Output: scraped reviews]
# Drop duplicate reviews
file_data=file_data.drop_duplicates()
file_data
[Output: reviews after de-duplication]
# Segment the Chinese reviews with jieba in accurate mode
cut_words = jieba.lcut(str(file_data['商品评价'].values), cut_all=False)
cut_words
[Output: segmentation results]
# Load the stop-word list (one stop word per line)
file_path = open(r'C:\Users\hq\Desktop\停用词.txt', encoding='utf-8')
stop_words = file_path.read().splitlines()
new_data = []
for word in cut_words:
    if word not in stop_words:
        new_data.append(word)
new_data
[Output: tokens after stop-word removal]
# Word frequency statistics
freq_list = FreqDist(new_data)
# Return the (word, count) pairs sorted by frequency
most_common_words = freq_list.most_common()
most_common_words
[Output: word frequency counts]
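FreqDist can also draw the counts directly via its built-in plot method; a quick sketch (the choice of the top 20 words is arbitrary, and a Chinese-capable font is assumed so the tick labels render):

import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']   # assumes the SimHei font is installed (typical on Windows)
freq_list.plot(20, cumulative=False)           # line chart of the 20 most frequent tokens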
from matplotlib import pyplot as plt
from wordcloud import WordCloud
# Generate and display the word cloud
font = r'C:\Windows\Fonts\STXINGKA.TTF'
wc = WordCloud(font_path=font, background_color='white', width=1000, height=800).generate(" ".join(new_data))
plt.imshow(wc)
plt.axis('off')
plt.show()
[Output: word cloud]
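If the image needs to be kept rather than only displayed, the WordCloud object can write itself to disk; a one-line sketch (the output file name is arbitrary):

# Save the rendered word cloud as a PNG file
wc.to_file('review_wordcloud.png')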
